Compare commits

..

1 Commits

Author SHA1 Message Date
327acf913d clone from 6640eda 2025-03-12 17:21:50 -07:00
394 changed files with 5078 additions and 13158 deletions

View File

@ -105,6 +105,7 @@ case "$image" in
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=11
PROTOBUF=yes
DB=yes
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
@ -118,6 +119,7 @@ case "$image" in
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=9
PROTOBUF=yes
DB=yes
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
@ -132,6 +134,7 @@ case "$image" in
ANACONDA_PYTHON_VERSION=3.12
GCC_VERSION=9
PROTOBUF=yes
DB=yes
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
@ -146,6 +149,7 @@ case "$image" in
ANACONDA_PYTHON_VERSION=3.13
GCC_VERSION=9
PROTOBUF=yes
DB=yes
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
@ -160,6 +164,7 @@ case "$image" in
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=9
PROTOBUF=yes
DB=yes
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
@ -173,6 +178,7 @@ case "$image" in
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=9
PROTOBUF=yes
DB=yes
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
@ -187,6 +193,7 @@ case "$image" in
ANACONDA_PYTHON_VERSION=3.12
GCC_VERSION=9
PROTOBUF=yes
DB=yes
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
@ -201,6 +208,7 @@ case "$image" in
ANACONDA_PYTHON_VERSION=3.13
GCC_VERSION=9
PROTOBUF=yes
DB=yes
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
@ -215,6 +223,7 @@ case "$image" in
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=9
PROTOBUF=yes
DB=yes
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
@ -226,6 +235,7 @@ case "$image" in
ANACONDA_PYTHON_VERSION=3.9
CLANG_VERSION=10
PROTOBUF=yes
DB=yes
VISION=yes
CONDA_CMAKE=yes
ONNX=yes
@ -234,6 +244,7 @@ case "$image" in
ANACONDA_PYTHON_VERSION=3.9
CLANG_VERSION=10
PROTOBUF=yes
DB=yes
VISION=yes
VULKAN_SDK_VERSION=1.2.162.1
SWIFTSHADER=yes
@ -244,6 +255,7 @@ case "$image" in
ANACONDA_PYTHON_VERSION=3.11
CLANG_VERSION=10
PROTOBUF=yes
DB=yes
VISION=yes
VULKAN_SDK_VERSION=1.2.162.1
SWIFTSHADER=yes
@ -254,6 +266,7 @@ case "$image" in
ANACONDA_PYTHON_VERSION=3.9
GCC_VERSION=9
PROTOBUF=yes
DB=yes
VISION=yes
CONDA_CMAKE=yes
TRITON=yes
@ -262,6 +275,7 @@ case "$image" in
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=11
PROTOBUF=yes
DB=yes
VISION=yes
ROCM_VERSION=6.2.4
NINJA_VERSION=1.9.0
@ -276,6 +290,7 @@ case "$image" in
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=11
PROTOBUF=yes
DB=yes
VISION=yes
ROCM_VERSION=6.3
NINJA_VERSION=1.9.0
@ -290,6 +305,7 @@ case "$image" in
ANACONDA_PYTHON_VERSION=3.9
GCC_VERSION=11
PROTOBUF=yes
DB=yes
VISION=yes
XPU_VERSION=0.5
NINJA_VERSION=1.9.0
@ -300,6 +316,7 @@ case "$image" in
ANACONDA_PYTHON_VERSION=3.9
GCC_VERSION=11
PROTOBUF=yes
DB=yes
VISION=yes
XPU_VERSION=2025.0
NINJA_VERSION=1.9.0
@ -310,6 +327,7 @@ case "$image" in
ANACONDA_PYTHON_VERSION=3.9
GCC_VERSION=11
PROTOBUF=yes
DB=yes
VISION=yes
KATEX=yes
CONDA_CMAKE=yes
@ -323,6 +341,7 @@ case "$image" in
CUDNN_VERSION=9
CLANG_VERSION=12
PROTOBUF=yes
DB=yes
VISION=yes
TRITON=yes
;;
@ -330,6 +349,7 @@ case "$image" in
ANACONDA_PYTHON_VERSION=3.9
CLANG_VERSION=12
PROTOBUF=yes
DB=yes
VISION=yes
CONDA_CMAKE=yes
TRITON=yes
@ -350,6 +370,7 @@ case "$image" in
ANACONDA_PYTHON_VERSION=3.9
GCC_VERSION=11
PROTOBUF=yes
DB=yes
VISION=yes
KATEX=yes
CONDA_CMAKE=yes
@ -395,6 +416,7 @@ case "$image" in
GCC_VERSION=11
ACL=yes
PROTOBUF=yes
DB=yes
VISION=yes
CONDA_CMAKE=yes
# snadampal: skipping llvm src build install because the current version
@ -406,6 +428,7 @@ case "$image" in
GCC_VERSION=11
ACL=yes
PROTOBUF=yes
DB=yes
VISION=yes
CONDA_CMAKE=yes
# snadampal: skipping llvm src build install because the current version
@ -416,6 +439,7 @@ case "$image" in
*)
# Catch-all for builds that are not hardcoded.
PROTOBUF=yes
DB=yes
VISION=yes
echo "image '$image' did not match an existing build configuration"
if [[ "$image" == *py* ]]; then
@ -471,6 +495,7 @@ docker build \
--build-arg "BUILD_ENVIRONMENT=${image}" \
--build-arg "PROTOBUF=${PROTOBUF:-}" \
--build-arg "LLVMDEV=${LLVMDEV:-}" \
--build-arg "DB=${DB:-}" \
--build-arg "VISION=${VISION:-}" \
--build-arg "UBUNTU_VERSION=${UBUNTU_VERSION}" \
--build-arg "CENTOS_VERSION=${CENTOS_VERSION}" \

View File

@ -55,6 +55,13 @@ RUN if [ -n "${PROTOBUF}" ]; then bash ./install_protobuf.sh; fi
RUN rm install_protobuf.sh
ENV INSTALLED_PROTOBUF ${PROTOBUF}
# (optional) Install database packages like LMDB and LevelDB
ARG DB
COPY ./common/install_db.sh install_db.sh
RUN if [ -n "${DB}" ]; then bash ./install_db.sh; fi
RUN rm install_db.sh
ENV INSTALLED_DB ${DB}
# (optional) Install vision packages like OpenCV
ARG VISION
COPY ./common/install_vision.sh ./common/cache_vision_models.sh ./common/common_utils.sh ./

View File

@ -1 +1 @@
v2.26.2-1
v2.25.1-1

View File

@ -240,7 +240,7 @@ function prune_126 {
}
function install_128 {
CUDNN_VERSION=9.8.0.87
CUDNN_VERSION=9.7.1.26
echo "Installing CUDA 12.8.0 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.6.3"
rm -rf /usr/local/cuda-12.8 /usr/local/cuda
# install CUDA 12.8.0 in the same container

View File

@ -161,7 +161,7 @@ function prune_126 {
}
function install_128 {
CUDNN_VERSION=9.8.0.87
CUDNN_VERSION=9.7.1.26
echo "Installing CUDA 12.8.0 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.6.3"
rm -rf /usr/local/cuda-12.8 /usr/local/cuda
# install CUDA 12.8.0 in the same container

View File

@ -5,7 +5,7 @@ if [[ -n "${CUDNN_VERSION}" ]]; then
mkdir tmp_cudnn
pushd tmp_cudnn
if [[ ${CUDA_VERSION:0:4} == "12.8" ]]; then
CUDNN_NAME="cudnn-linux-x86_64-9.8.0.87_cuda12-archive"
CUDNN_NAME="cudnn-linux-x86_64-9.7.1.26_cuda12-archive"
elif [[ ${CUDA_VERSION:0:4} == "12.6" ]]; then
CUDNN_NAME="cudnn-linux-x86_64-9.5.1.17_cuda12-archive"
elif [[ ${CUDA_VERSION:0:2} == "12" ]]; then

38
.ci/docker/common/install_db.sh Executable file
View File

@ -0,0 +1,38 @@
#!/bin/bash
# Detects the base OS from /etc/os-release and runs the matching setup routine.
# As written, neither routine installs any database package: the Ubuntu path
# only refreshes the apt index and cleans caches, and the CentOS path only
# installs epel-release and cleans caches.

# -e: stop at the first failing command; -x: trace every command as it runs.
set -ex
# Ubuntu path: refresh the apt package index, then remove apt caches and
# temporary files.
install_ubuntu() {
apt-get update
# Cleanup
# `apt-get clean` runs only if `apt-get autoclean` succeeded.
apt-get autoclean && apt-get clean
rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
}
# CentOS path: install the epel-release package (with the `extras` repo
# enabled for this call), then remove yum caches, yumdb and history.
install_centos() {
# Need EPEL for many packages we depend on.
# See http://fedoraproject.org/wiki/EPEL
yum --enablerepo=extras install -y epel-release
# Cleanup
yum clean all
rm -rf /var/cache/yum
rm -rf /var/lib/yum/yumdb
rm -rf /var/lib/yum/history
}
# Install base packages depending on the base OS
# ID is whatever follows "ID=" at the start of a line in /etc/os-release,
# with any double quotes removed.
ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')
case "$ID" in
ubuntu)
install_ubuntu
;;
centos)
install_centos
;;
*)
# Any other ID value is treated as unsupported: print a message and fail.
echo "Unable to determine OS..."
exit 1
;;
esac

View File

@ -25,9 +25,7 @@ python3 -m pip install meson ninja
###########################
### clone repo
###########################
# TEMPORARY FIX: https://gitlab.freedesktop.org/mesa/drm.git is down until 2025/03/22
# GIT_SSL_NO_VERIFY=true git clone https://gitlab.freedesktop.org/mesa/drm.git
GIT_SSL_NO_VERIFY=true git clone git://anongit.freedesktop.org/mesa/drm
GIT_SSL_NO_VERIFY=true git clone https://gitlab.freedesktop.org/mesa/drm.git
pushd drm
###########################

View File

@ -41,14 +41,11 @@ fbscribelogger==0.1.7
#Pinned versions: 0.1.6
#test that import:
flatbuffers==2.0 ; platform_machine != "s390x"
flatbuffers==2.0
#Description: cross platform serialization library
#Pinned versions: 2.0
#test that import:
flatbuffers ; platform_machine == "s390x"
#Description: cross platform serialization library; Newer version is required on s390x for new python version
hypothesis==5.35.1
# Pin hypothesis to avoid flakiness: https://github.com/pytorch/pytorch/issues/31136
#Description: advanced library for generating parametrized tests
@ -105,10 +102,10 @@ networkx==2.8.8
#Pinned versions: 2.8.8
#test that import: functorch
ninja==1.11.1.3
#Description: build system. Used in some tests. Used in build to generate build
#time tracing information
#Pinned versions: 1.11.1.3
#ninja
#Description: build system. Note that installing it from
#here breaks things, so it is commented out
#Pinned versions: 1.10.0.post1
#test that import: run_test.py, test_cpp_extensions_aot.py,test_determination.py
numba==0.49.0 ; python_version < "3.9"
@ -368,6 +365,7 @@ PyYAML
pyzstd
setuptools
ninja==1.11.1 ; platform_machine == "aarch64"
scons==4.5.2 ; platform_machine == "aarch64"
pulp==2.9.0 ; python_version >= "3.8"

View File

@ -50,6 +50,13 @@ RUN if [ -n "${PROTOBUF}" ]; then bash ./install_protobuf.sh; fi
RUN rm install_protobuf.sh
ENV INSTALLED_PROTOBUF ${PROTOBUF}
# (optional) Install database packages like LMDB and LevelDB
ARG DB
COPY ./common/install_db.sh install_db.sh
RUN if [ -n "${DB}" ]; then bash ./install_db.sh; fi
RUN rm install_db.sh
ENV INSTALLED_DB ${DB}
# (optional) Install vision packages like OpenCV
ARG VISION
COPY ./common/install_vision.sh ./common/cache_vision_models.sh ./common/common_utils.sh ./

View File

@ -50,6 +50,13 @@ RUN if [ -n "${PROTOBUF}" ]; then bash ./install_protobuf.sh; fi
RUN rm install_protobuf.sh
ENV INSTALLED_PROTOBUF ${PROTOBUF}
# (optional) Install database packages like LMDB and LevelDB
ARG DB
COPY ./common/install_db.sh install_db.sh
RUN if [ -n "${DB}" ]; then bash ./install_db.sh; fi
RUN rm install_db.sh
ENV INSTALLED_DB ${DB}
# (optional) Install vision packages like OpenCV
ARG VISION
COPY ./common/install_vision.sh ./common/cache_vision_models.sh ./common/common_utils.sh ./

View File

@ -77,6 +77,13 @@ COPY triton_version.txt triton_version.txt
RUN if [ -n "${TRITON}" ]; then bash ./install_triton.sh; fi
RUN rm install_triton.sh common_utils.sh triton-xpu.txt triton_version.txt
# (optional) Install database packages like LMDB and LevelDB
ARG DB
COPY ./common/install_db.sh install_db.sh
RUN if [ -n "${DB}" ]; then bash ./install_db.sh; fi
RUN rm install_db.sh
ENV INSTALLED_DB ${DB}
# (optional) Install vision packages like OpenCV
ARG VISION
COPY ./common/install_vision.sh ./common/cache_vision_models.sh ./common/common_utils.sh ./

View File

@ -74,6 +74,13 @@ RUN if [ -n "${PROTOBUF}" ]; then bash ./install_protobuf.sh; fi
RUN rm install_protobuf.sh
ENV INSTALLED_PROTOBUF ${PROTOBUF}
# (optional) Install database packages like LMDB and LevelDB
ARG DB
COPY ./common/install_db.sh install_db.sh
RUN if [ -n "${DB}" ]; then bash ./install_db.sh; fi
RUN rm install_db.sh
ENV INSTALLED_DB ${DB}
# (optional) Install vision packages like OpenCV
ARG VISION
COPY ./common/install_vision.sh ./common/cache_vision_models.sh ./common/common_utils.sh ./

View File

@ -73,14 +73,26 @@ fi
# Check GCC ABI
###############################################################################
# NOTE: As of https://github.com/pytorch/pytorch/issues/126551 we only produce
# wheels with cxx11-abi
# NOTE [ Building libtorch with old vs. new gcc ABI ]
#
# Packages built with one version of ABI could not be linked against by client
# C++ libraries that were compiled using the other version of ABI. Since both
# gcc ABIs are still common in the wild, we need to support both ABIs. Currently:
#
# - All the nightlies built on CentOS 7 + devtoolset7 use the old gcc ABI.
# - All the nightlies built on Ubuntu 16.04 + gcc 5.4 use the new gcc ABI.
echo "Checking that the gcc ABI is what we expect"
if [[ "$(uname)" != 'Darwin' ]]; then
function is_expected() {
if [[ "$1" -gt 0 || "$1" == "ON " ]]; then
echo 1
if [[ "$DESIRED_DEVTOOLSET" == *"cxx11-abi"* || "$DESIRED_CUDA" == *"rocm"* ]]; then
if [[ "$1" -gt 0 || "$1" == "ON " ]]; then
echo 1
fi
else
if [[ -z "$1" || "$1" == 0 || "$1" == "OFF" ]]; then
echo 1
fi
fi
}

View File

@ -121,9 +121,9 @@ def main() -> None:
else:
install_root = Path(distutils.sysconfig.get_python_lib()) / "torch"
libtorch_cpu_path = str(install_root / "lib" / "libtorch_cpu.so")
# NOTE: All binaries are built with cxx11abi now
check_lib_symbols_for_abi_correctness(libtorch_cpu_path, False)
libtorch_cpu_path = install_root / "lib" / "libtorch_cpu.so"
pre_cxx11_abi = "cxx11-abi" not in os.getenv("DESIRED_DEVTOOLSET", "")
check_lib_symbols_for_abi_correctness(libtorch_cpu_path, pre_cxx11_abi)
if __name__ == "__main__":

View File

@ -76,13 +76,10 @@ def read_release_matrix():
def test_numpy():
try:
import numpy as np
import numpy as np
x = np.arange(5)
torch.tensor(x)
except ImportError:
print("Numpy check skipped. Numpy is not installed.")
x = np.arange(5)
torch.tensor(x)
def check_version(package: str) -> None:
@ -413,7 +410,6 @@ def main() -> None:
smoke_test_conv2d()
test_linalg()
test_numpy()
if is_cuda_system:
test_linalg("cuda")
test_cuda_gds_errors_captured()

View File

@ -1619,7 +1619,6 @@ elif [[ "${TEST_CONFIG}" == *inductor_cpp_wrapper* ]]; then
install_torchvision
checkout_install_torchbench hf_T5 llama moco
PYTHONPATH=$(pwd)/torchbench test_inductor_cpp_wrapper_shard "$SHARD_NUMBER"
test_inductor_aoti
elif [[ "${TEST_CONFIG}" == *inductor* ]]; then
install_torchvision
test_inductor_shard "${SHARD_NUMBER}"

View File

@ -55,16 +55,12 @@ s3_upload() {
s3_upload_dir="${s3_root_dir}/${UPLOAD_SUBFOLDER}/"
fi
(
cache_control_flag=""
if [[ "${UPLOAD_CHANNEL}" = "test" ]]; then
cache_control_flag="--cache-control='no-cache,no-store,must-revalidate'"
fi
for pkg in ${PKG_DIR}/*.${extension}; do
(
set -x
shm_id=$(sha256sum "${pkg}" | awk '{print $1}')
${AWS_S3_CP} --no-progress --acl public-read "${pkg}" "${s3_upload_dir}" \
--metadata "checksum-sha256=${shm_id}" ${cache_control_flag}
--metadata "checksum-sha256=${shm_id}"
)
done
)

View File

@ -3,11 +3,8 @@ self-hosted-runner:
# GitHub hosted runner that actionlint doesn't recognize because actionlint version (1.6.21) is too old
- ubuntu-24.04
# GitHub hosted x86 Linux runners
# TODO: Cleanup mentions of linux.20_04 when upgrade to linux.24_04 is complete
- linux.20_04.4x
- linux.20_04.16x
- linux.24_04.4x
- linux.24_04.16x
# Organization-wide AWS Linux Runners
- linux.large
- linux.2xlarge
@ -52,7 +49,6 @@ self-hosted-runner:
- linux.rocm.gpu
- linux.rocm.gpu.2
- linux.rocm.gpu.4
- rocm-docker
# Repo-specific Apple hosted runners
- macos-m1-ultra
- macos-m2-14

View File

@ -23,44 +23,9 @@ runs:
id: check_container_runner
run: echo "IN_CONTAINER_RUNNER=$(if [ -f /.inarc ] || [ -f /.incontainer ]; then echo true ; else echo false; fi)" >> "$GITHUB_OUTPUT"
- name: Set up parallel fetch and clean workspace
id: first-clean
continue-on-error: true
- name: Clean workspace
shell: bash
if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' }}
env:
NO_SUDO: ${{ inputs.no-sudo }}
run: |
# Use all available CPUs for fetching
cd "${GITHUB_WORKSPACE}"
git config --global fetch.parallel 0
git config --global submodule.fetchJobs 0
# Clean workspace. The default checkout action should also do this, but
# do it here as well just in case
if [[ -d .git ]]; then
if [ -z "${NO_SUDO}" ]; then
sudo git clean -ffdx
else
git clean -ffdx
fi
fi
- name: Checkout PyTorch
id: first-checkout-attempt
continue-on-error: true
uses: actions/checkout@v4
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
# --depth=1 for speed, manually fetch history and other refs as necessary
fetch-depth: ${{ inputs.fetch-depth }}
submodules: ${{ inputs.submodules }}
show-progress: false
- name: Clean workspace (try again)
if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' &&
(steps.first-clean.outcome != 'success' || steps.first-checkout-attempt.outcome != 'success') }}
shell: bash
env:
NO_SUDO: ${{ inputs.no-sudo }}
run: |
@ -75,11 +40,16 @@ runs:
fi
mkdir "${GITHUB_WORKSPACE}"
- name: Checkout PyTorch (try again)
# Use all available CPUs for fetching
cd "${GITHUB_WORKSPACE}"
git config --global fetch.parallel 0
git config --global submodule.fetchJobs 0
- name: Checkout PyTorch
uses: actions/checkout@v4
if: ${{ steps.first-clean.outcome != 'success' || steps.first-checkout-attempt.outcome != 'success' }}
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
# --depth=1 for speed, manually fetch history and other refs as necessary
fetch-depth: ${{ inputs.fetch-depth }}
submodules: ${{ inputs.submodules }}
show-progress: false

View File

@ -68,7 +68,7 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
"nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'"
@ -77,14 +77,14 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
"nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cudnn-cu12==9.8.0.87; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cudnn-cu12==9.7.1.26; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64'"

View File

@ -1,97 +0,0 @@
#!/usr/bin/env python3
"""Print every discovered test file that is not excluded by the skip lists.

The names come from ``tools.testing.discover_tests.TESTS``. Each name that
survives filtering is written to stdout on its own line, wrapped in double
quotes and followed by a comma.
"""

import os
import re
import sys


# Extend the import path with the directory three levels above sys.path[0]
# before importing from `tools`.
sys.path.insert(1, os.path.join(sys.path[0], "..", "..", ".."))

from tools.testing.discover_tests import TESTS


# Test files excluded by exact name.
skip_list = [
    # these tests fail due to various reasons
    "dynamo/test_misc",
    "inductor/test_aot_inductor",
    "inductor/test_cpu_repro",
    "inductor/test_cpu_select_algorithm",
    "inductor/test_aot_inductor_arrayref",
    "inductor/test_torchinductor_codegen_dynamic_shapes",
    "lazy/test_meta_kernel",
    "onnx/test_utility_funs",
    "profiler/test_profiler",
    "test_ao_sparsity",
    "test_cpp_extensions_open_device_registration",
    "test_jit",
    "test_metal",
    "test_mps",
    "dynamo/test_torchrec",
    "inductor/test_aot_inductor_utils",
    "inductor/test_coordinate_descent_tuner",
    "test_jiterator",
    # these tests run long and fail in addition to that
    "dynamo/test_dynamic_shapes",
    "test_quantization",
    "inductor/test_torchinductor",
    "inductor/test_torchinductor_dynamic_shapes",
    "inductor/test_torchinductor_opinfo",
    "test_binary_ufuncs",
    "test_unary_ufuncs",
    # these tests fail when cuda is not available
    "inductor/test_cudacodecache",
    "inductor/test_inductor_utils",
    "inductor/test_inplacing_pass",
    "inductor/test_kernel_benchmark",
    "inductor/test_max_autotune",
    "inductor/test_move_constructors_to_cuda",
    "inductor/test_multi_kernel",
    "inductor/test_pattern_matcher",
    "inductor/test_perf",
    "inductor/test_select_algorithm",
    "inductor/test_snode_runtime",
    "inductor/test_triton_wrapper",
    # these tests fail when mkldnn is not available
    "inductor/test_custom_post_grad_passes",
    "inductor/test_mkldnn_pattern_matcher",
    # lacks quantization support
    "onnx/test_models_quantized_onnxruntime",
    "onnx/test_pytorch_onnx_onnxruntime",
    # https://github.com/pytorch/pytorch/issues/102078
    "test_decomp",
    # https://github.com/pytorch/pytorch/issues/146698
    "test_model_exports_to_core_aten",
    # runs very long, skip for now
    "inductor/test_layout_optim",
    "test_fx",
    # some false errors
    "doctests",
]

# Test files excluded when the whole name matches one of these patterns.
skip_list_regex = [
    # distributed tests fail randomly
    "distributed/.*",
]


def _is_skipped(name):
    """Return True if `name` is in skip_list or fully matches a skip_list_regex entry."""
    if name in skip_list:
        return True
    return any(re.fullmatch(pattern, name) for pattern in skip_list_regex)


all_testfiles = sorted(TESTS)
filtered_testfiles = [name for name in all_testfiles if not _is_skipped(name)]

for kept in filtered_testfiles:
    print(' "' + kept + '",')

View File

@ -819,9 +819,10 @@ class GitHubPR:
cursor=info["reviews"]["pageInfo"]["startCursor"],
)
info = rc["data"]["repository"]["pullRequest"]
reviews = {
author: state for author, state in self._reviews if state != "COMMENTED"
}
reviews = {}
for author, state in self._reviews:
if state != "COMMENTED":
reviews[author] = state
return list(reviews.items())
def get_approved_by(self) -> list[str]:
@ -2281,8 +2282,7 @@ def merge(
except MandatoryChecksMissingError as ex:
last_exception = str(ex)
print(
f"Merge of https://github.com/{pr.org}/{pr.project}/pull/{pr.pr_num} failed due to: {ex}. Retrying in 5 min",
flush=True,
f"Merge of https://github.com/{pr.org}/{pr.project}/pull/{pr.pr_num} failed due to: {ex}. Retrying in 5 min"
)
time.sleep(5 * 60)
# Finally report timeout back

View File

@ -33,6 +33,10 @@ on:
default: "3.9"
description: |
The python version to be used. Will be 3.9 by default
environment-file:
required: false
type: string
description: Set the conda environment file used to setup macOS build.
test-matrix:
required: false
type: string
@ -82,12 +86,23 @@ jobs:
fi
- name: Setup miniconda
if: inputs.environment-file == ''
uses: pytorch/test-infra/.github/actions/setup-miniconda@main
with:
python-version: ${{ inputs.python-version }}
environment-file: .github/requirements/conda-env-${{ runner.os }}-${{ runner.arch }}
pip-requirements-file: .github/requirements/pip-requirements-${{ runner.os }}.txt
# This option is used when cross-compiling arm64 from x86-64. Specifically, we need arm64 conda
# environment even though the arch is x86-64
- name: Setup miniconda using the provided environment file
if: inputs.environment-file != ''
uses: pytorch/test-infra/.github/actions/setup-miniconda@main
with:
python-version: ${{ inputs.python-version }}
environment-file: ${{ inputs.environment-file }}
pip-requirements-file: .github/requirements/pip-requirements-${{ runner.os }}.txt
- name: Install sccache (only for non-forked PRs, and pushes to trunk)
uses: nick-fields/retry@v3.0.0
if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }}

View File

@ -35,7 +35,7 @@ jobs:
pull-requests: write
name: Check labels
if: github.repository_owner == 'pytorch'
runs-on: linux.24_04.4x
runs-on: linux.20_04.4x
steps:
- name: Checkout PyTorch
uses: pytorch/pytorch/.github/actions/checkout-pytorch@main

View File

@ -1,55 +0,0 @@
# Scheduled / manually triggered workflow: pulls the
# pytorch-linux-focal-rocm-n-py3 docker image on a `rocm-docker` runner,
# saves it to a tar file and copies that file to a remote with rclone.
name: docker-cache-mi300
on:
# run every 6 hours
schedule:
- cron: 0 0,6,12,18 * * *
workflow_dispatch:
# Runs sharing the same workflow / PR-number-or-SHA / event-name group cancel
# the one already in progress.
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name }}
cancel-in-progress: true
permissions:
id-token: write
contents: read
jobs:
docker-cache:
# Skipped unless the repository owner is `pytorch`.
if: github.repository_owner == 'pytorch'
runs-on: rocm-docker
steps:
- name: Checkout PyTorch
uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
with:
no-sudo: true
- name: configure aws credentials
id: aws_creds
uses: aws-actions/configure-aws-credentials@v4
with:
role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
aws-region: us-east-1
# 18000 seconds = 5 hours
role-duration-seconds: 18000
- name: Login to Amazon ECR
id: login-ecr
continue-on-error: false
uses: aws-actions/amazon-ecr-login@v2
- name: Calculate docker image
id: calculate-docker-image
uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
with:
docker-image-name: pytorch-linux-focal-rocm-n-py3
push: false
- name: Pull docker image
uses: pytorch/test-infra/.github/actions/pull-docker-image@main
with:
docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
# NOTE(review): the step name says "S3 bucket" but the rclone destination is
# the `oci:` remote — confirm which storage backend is intended.
- name: Tar and upload to S3 bucket
run: |
sudo docker save -o ~/docker-data/pytorch/pytorch_docker_image.tar ${{ steps.calculate-docker-image.outputs.docker-image }}
sudo rclone copy -P --s3-upload-concurrency 64 --s3-chunk-size 200M --s3-upload-cutoff 300M ~/docker-data/pytorch/pytorch_docker_image.tar oci:pytorchbucket0002/pytorch_docker_image --progress

View File

@ -64,7 +64,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_9-cpu-aarch64
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-cpu-aarch64-test: # Testing
@ -134,7 +134,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_9-cuda-aarch64-12_8
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.8.0.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.7.1.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -181,7 +181,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_10-cpu-aarch64
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_10-cpu-aarch64-test: # Testing
@ -251,7 +251,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_10-cuda-aarch64-12_8
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.8.0.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.7.1.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -298,7 +298,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_11-cpu-aarch64
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_11-cpu-aarch64-test: # Testing
@ -368,7 +368,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_11-cuda-aarch64-12_8
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.8.0.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.7.1.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -415,7 +415,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_12-cpu-aarch64
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_12-cpu-aarch64-test: # Testing
@ -485,7 +485,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_12-cuda-aarch64-12_8
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.8.0.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.7.1.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -532,7 +532,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13-cpu-aarch64
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13-cpu-aarch64-test: # Testing
@ -602,7 +602,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13-cuda-aarch64-12_8
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.8.0.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.7.1.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -649,7 +649,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13t-cpu-aarch64
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13t-cpu-aarch64-test: # Testing
@ -719,7 +719,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13t-cuda-aarch64-12_8
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.8.0.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.7.1.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}

View File

@ -105,7 +105,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_9-cuda12_6
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-cuda12_6-test: # Testing
@ -152,7 +152,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_9-cuda12_8
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.8.0.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.7.1.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-cuda12_8-test: # Testing

View File

@ -262,7 +262,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_9-cuda12_6
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-cuda12_6-test: # Testing
@ -331,7 +331,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_9-cuda12_8
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.8.0.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.7.1.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-cuda12_8-test: # Testing
@ -891,7 +891,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_10-cuda12_6
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_10-cuda12_6-test: # Testing
@ -960,7 +960,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_10-cuda12_8
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.8.0.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.7.1.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_10-cuda12_8-test: # Testing
@ -1520,7 +1520,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_11-cuda12_6
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_11-cuda12_6-test: # Testing
@ -1654,7 +1654,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_11-cuda12_8
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.8.0.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.7.1.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_11-cuda12_8-test: # Testing
@ -2214,7 +2214,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_12-cuda12_6
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_12-cuda12_6-test: # Testing
@ -2283,7 +2283,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_12-cuda12_8
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.8.0.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.7.1.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_12-cuda12_8-test: # Testing
@ -2843,7 +2843,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13-cuda12_6
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13-cuda12_6-test: # Testing
@ -2912,7 +2912,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13-cuda12_8
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.8.0.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.7.1.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13-cuda12_8-test: # Testing
@ -3472,7 +3472,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13t-cuda12_6
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13t-cuda12_6-test: # Testing
@ -3541,7 +3541,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13t-cuda12_8
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.8.0.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.7.1.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13t-cuda12_8-test: # Testing

View File

@ -63,7 +63,7 @@ jobs:
timeout-minutes: 420
build_name: manywheel-py3_9-cpu-s390x
build_environment: linux-s390x-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-cpu-s390x-test: # Testing
@ -128,7 +128,7 @@ jobs:
timeout-minutes: 420
build_name: manywheel-py3_10-cpu-s390x
build_environment: linux-s390x-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_10-cpu-s390x-test: # Testing
@ -193,7 +193,7 @@ jobs:
timeout-minutes: 420
build_name: manywheel-py3_11-cpu-s390x
build_environment: linux-s390x-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_11-cpu-s390x-test: # Testing
@ -258,7 +258,7 @@ jobs:
timeout-minutes: 420
build_name: manywheel-py3_12-cpu-s390x
build_environment: linux-s390x-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_12-cpu-s390x-test: # Testing
@ -323,7 +323,7 @@ jobs:
timeout-minutes: 420
build_name: manywheel-py3_13-cpu-s390x
build_environment: linux-s390x-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13-cpu-s390x-test: # Testing

View File

@ -43,7 +43,7 @@ jobs:
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.9"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
@ -167,7 +167,7 @@ jobs:
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.10"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
@ -291,7 +291,7 @@ jobs:
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.11"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
@ -415,7 +415,7 @@ jobs:
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.12"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
@ -539,7 +539,7 @@ jobs:
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.13"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
@ -663,7 +663,7 @@ jobs:
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.13t"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the

View File

@ -54,7 +54,7 @@ jobs:
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.12"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the

View File

@ -54,7 +54,7 @@ jobs:
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.9"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash
@ -290,7 +290,7 @@ jobs:
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.9"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash
@ -528,7 +528,7 @@ jobs:
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.9"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash
@ -766,7 +766,7 @@ jobs:
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.9"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash
@ -1238,7 +1238,7 @@ jobs:
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.10"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash
@ -1474,7 +1474,7 @@ jobs:
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.10"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash
@ -1712,7 +1712,7 @@ jobs:
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.10"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash
@ -1950,7 +1950,7 @@ jobs:
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.10"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash
@ -2422,7 +2422,7 @@ jobs:
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.11"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash
@ -2658,7 +2658,7 @@ jobs:
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.11"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash
@ -2896,7 +2896,7 @@ jobs:
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.11"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash
@ -3134,7 +3134,7 @@ jobs:
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.11"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash
@ -3606,7 +3606,7 @@ jobs:
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.12"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash
@ -3842,7 +3842,7 @@ jobs:
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.12"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash
@ -4080,7 +4080,7 @@ jobs:
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.12"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash
@ -4318,7 +4318,7 @@ jobs:
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.12"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash
@ -4790,7 +4790,7 @@ jobs:
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.13"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash
@ -5026,7 +5026,7 @@ jobs:
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.13"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash
@ -5264,7 +5264,7 @@ jobs:
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.13"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash
@ -5502,7 +5502,7 @@ jobs:
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.13"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash
@ -5974,7 +5974,7 @@ jobs:
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.13t"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash
@ -6210,7 +6210,7 @@ jobs:
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.13t"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash
@ -6448,7 +6448,7 @@ jobs:
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.13t"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash
@ -6686,7 +6686,7 @@ jobs:
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.13t"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash

View File

@ -26,7 +26,7 @@ jobs:
curr_branch: ${{ github.head_ref || github.ref_name }}
lintrunner-clang:
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
needs: get-label-type
with:
timeout: 120
@ -43,7 +43,7 @@ jobs:
.github/scripts/lintrunner.sh
lintrunner-noclang:
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
needs: get-label-type
with:
timeout: 120
@ -59,7 +59,7 @@ jobs:
.github/scripts/lintrunner.sh
quick-checks:
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
needs: get-label-type
with:
timeout: 120
@ -116,7 +116,7 @@ jobs:
bash .github/scripts/pr-sanity-check.sh
workflow-checks:
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
needs: get-label-type
with:
timeout: 120
@ -154,7 +154,7 @@ jobs:
exit $RC
toc:
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
needs: get-label-type
with:
timeout: 120
@ -194,7 +194,7 @@ jobs:
test-tools:
name: Test tools
if: ${{ github.repository == 'pytorch/pytorch' }}
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
needs: get-label-type
with:
timeout: 120
@ -215,7 +215,7 @@ jobs:
test_run_test:
name: Test `run_test.py` is usable without boto3
if: ${{ github.repository == 'pytorch/pytorch' }}
runs-on: linux.24_04.4x
runs-on: linux.20_04.4x
steps:
- name: Checkout PyTorch
uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
@ -241,18 +241,10 @@ jobs:
test_collect_env:
if: ${{ github.repository == 'pytorch/pytorch' }}
name: Test collect_env
runs-on: ${{ matrix.runner }}
runs-on: linux.20_04.4x
strategy:
matrix:
include:
- test_type: with_torch
runner: linux.24_04.4x
- test_type: without_torch
runner: linux.24_04.4x
# NOTE: The oldest supported version of python for 24.04 is 3.8
# so this cannot be updated if we want to keep this test at 3.6
- test_type: older_python_version
runner: linux.20_04.4x
test_type: [with_torch, without_torch, older_python_version]
steps:
# [see note: pytorch repo ref]
# deep clone (fetch-depth 0) required, to allow us to use git log

View File

@ -7,7 +7,7 @@ on:
jobs:
do_revert:
name: try_revert_pr_${{ github.event.client_payload.pr_num }}
runs-on: linux.24_04.4x
runs-on: linux.20_04.4x
environment: mergebot
env:
GH_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}

View File

@ -15,7 +15,7 @@ jobs:
check_binary_linux_cpu:
if: github.repository_owner == 'pytorch'
name: Test check_binary.sh for Linux CPU
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
with:
docker-image: python:3.11
docker-build-dir: "skip-docker-build"
@ -28,7 +28,7 @@ jobs:
check_binary_linux_cuda:
if: github.repository_owner == 'pytorch'
name: Test check_binary.sh for Linux CUDA
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
with:
runner: linux.4xlarge.nvidia.gpu
docker-image: python:3.11

View File

@ -7,7 +7,7 @@ on:
jobs:
do_merge:
name: try_merge_pr_${{ github.event.client_payload.pr_num }}
runs-on: linux.24_04.4x
runs-on: linux.20_04.4x
environment: mergebot
permissions:
id-token: write

View File

@ -19,7 +19,7 @@
- [Cherry Picking Fixes](#cherry-picking-fixes)
- [How to do Cherry Picking](#how-to-do-cherry-picking)
- [Cherry Picking Reverts](#cherry-picking-reverts)
- [Preparing and Creating Final Release Candidate](#preparing-and-creating-final-release-candidate)
- [Preparing and Creating Final Release candidate](#preparing-and-creating-final-release-candidate)
- [Promoting RCs to Stable](#promoting-rcs-to-stable)
- [Additional Steps to prepare for release day](#additional-steps-to-prepare-for-release-day)
- [Modify release matrix](#modify-release-matrix)
@ -63,7 +63,7 @@ Following is the Release Compatibility Matrix for PyTorch releases:
## Release Cadence
Following is the release cadence. All future dates below are tentative. For latest updates on the release schedule, please follow [dev discuss](https://dev-discuss.pytorch.org/c/release-announcements/27). Please note: Patch Releases are optional.
Following is the release cadence. All future dates below are tentative; for the latest updates on the release schedule, please follow [dev discuss](https://dev-discuss.pytorch.org/c/release-announcements/27). Please note: Patch Releases are optional.
| Minor Version | Release branch cut | Release date | First patch release date | Second patch release date|
| --- | --- | --- | --- | --- |
@ -91,20 +91,20 @@ Releasing a new version of PyTorch generally entails 3 major steps:
### Frequently Asked Questions
* Q: What is a release branch cut ?
* Q: What is release branch cut ?
* A: When the bulk of the tracked features have been merged into the main branch, the primary release engineer starts the release process of cutting the release branch by creating a new git branch based off of the current `main` development branch of PyTorch. This allows PyTorch development flow on `main` to continue uninterrupted, while the release engineering team focuses on stabilizing the release branch in order to release a series of release candidates (RC). The activities in the release branch include both regression and performance testing as well as polishing new features and fixing release-specific bugs. In general, new features *are not* added to the release branch after it has been created.
* Q: What is a cherry-pick ?
* Q: What is cherry-pick ?
* A: A cherry pick is a process of propagating commits from the main branch into the release branch, utilizing git's built-in [cherry-pick feature](https://git-scm.com/docs/git-cherry-pick). These commits are typically limited to small fixes or documentation updates to ensure that the release engineering team has sufficient time to complete a thorough round of testing on the release branch. To nominate a fix for cherry-picking, a separate pull request must be created against the respective release branch and then mentioned in the Release Tracker issue (example: https://github.com/pytorch/pytorch/issues/94937) following the template from the issue description. The comment nominating a particular cherry-pick for inclusion in the release should include the committed PR against the main branch, the newly created cherry-pick PR, as well as the acceptance criteria for why the cherry-pick is needed in the first place.
## Cutting a release branch preparations
Following requirements need to be met prior to cutting a release branch:
The following requirements need to be met prior to cutting a release branch:
* Resolve all outstanding issues in the milestones (for example [1.11.0](https://github.com/pytorch/pytorch/milestone/28)) before first RC cut is completed. After RC cut is completed, the following script should be executed from test-infra repo in order to validate the presence of the fixes in the release branch:
* Resolve all outstanding issues in the milestones (for example [1.11.0](https://github.com/pytorch/pytorch/milestone/28)) before the first RC cut is completed. After the RC cut is completed, the following script should be executed from the test-infra repo in order to validate the presence of the fixes in the release branch:
``` python github_analyze.py --repo-path ~/local/pytorch --remote upstream --branch release/1.11 --milestone-id 26 --missing-in-branch ```
* Validate that all new workflows have been created in the PyTorch and domain libraries included in the release. Validate it against all dimensions of release matrix, including operating systems (Linux, MacOS, Windows), Python versions as well as CPU architectures (x86 and arm) and accelerator versions (CUDA, ROCm, XPU).
* All the nightly jobs for pytorch and domain libraries should be green. Validate this using the following HUD links:
* Validate that all new workflows have been created in the PyTorch and domain libraries included in the release. Validate it against all dimensions of the release matrix, including operating systems (Linux, MacOS, Windows), Python versions, as well as CPU architectures (x86 and arm) and accelerator versions (CUDA, ROCm, XPU).
* All the nightly jobs for pytorch and domain libraries should be green. Validate this using the following HUD links:
* [Pytorch](https://hud.pytorch.org/hud/pytorch/pytorch/nightly)
* [TorchVision](https://hud.pytorch.org/hud/pytorch/vision/nightly)
* [TorchAudio](https://hud.pytorch.org/hud/pytorch/audio/nightly)
@ -224,12 +224,12 @@ Backups are stored in a non-public S3 bucket at [`s3://pytorch-backup`](https://
### Release Candidate health validation
Validate that the release jobs for pytorch and domain libraries are green. Validate this using the following HUD links:
Validate that the release jobs for pytorch and domain libraries are green. Validate this using the following HUD links:
* [Pytorch](https://hud.pytorch.org/hud/pytorch/pytorch/release%2F1.12)
* [TorchVision](https://hud.pytorch.org/hud/pytorch/vision/release%2F1.12)
* [TorchAudio](https://hud.pytorch.org/hud/pytorch/audio/release%2F1.12)
Validate that the documentation build has completed and generated an entry corresponding to the release in the [docs repository](https://github.com/pytorch/docs/tree/main/).
Validate that the documentation build has completed and generated an entry corresponding to the release in the [docs repository](https://github.com/pytorch/docs/tree/main/).
### Cherry Picking Fixes
@ -274,15 +274,15 @@ requires `pytorchbot`, so it's only available in PyTorch atm.
### Cherry Picking Reverts
If a PR that has been cherry-picked into the release branch has been reverted, its cherry-pick must be reverted as well.
If a PR that has been cherry-picked into the release branch has been reverted, its cherry-pick must be reverted as well.
Reverts for changes that were committed into the main branch prior to the branch cut must be propagated into the release branch as well.
Reverts for changes that were committed into the main branch prior to the branch cut must be propagated into the release branch as well.
## Preparing and Creating Final Release Candidate
## Preparing and Creating Final Release candidate
The following requirements need to be met prior to creating the final Release Candidate:
The following requirements need to be met prior to creating the final Release Candidate:
* Resolve all outstanding open issues in the milestone. There should be no open issues/PRs (for example [2.1.2](https://github.com/pytorch/pytorch/milestone/39)). Each issue should either be closed or de-milestoned.
* Resolve all outstanding open issues in the milestone. There should be no open issues/PRs (for example [2.1.2](https://github.com/pytorch/pytorch/milestone/39)). The issue should either be closed or de-milestoned.
* Validate that all closed milestone PRs are present in the release branch. Confirm this by running:
``` python github_analyze.py --repo-path ~/local/pytorch --remote upstream --branch release/2.2 --milestone-id 40 --missing-in-branch ```
@ -291,7 +291,7 @@ The following requirements need to be met prior to creating the final Release Ca
* Perform [Release Candidate health validation](#release-candidate-health-validation). CI should have the green signal.
After the final RC is created, the following tasks should be performed:
After the final RC is created, the following tasks should be performed:
* Perform [Release Candidate health validation](#release-candidate-health-validation). CI should have the green signal.
@ -323,25 +323,25 @@ Promotion should occur in two steps:
## Additional Steps to prepare for release day
The following should be prepared for the release day:
The following should be prepared for the release day
### Modify release matrix
Modify the release matrix for the get started page. See the following [PR](https://github.com/pytorch/test-infra/pull/4611) as reference.
The release matrix for the get started page needs to be modified. See the following [PR](https://github.com/pytorch/test-infra/pull/4611) as reference.
The PR to update published_versions.json and quick-start-module.js is auto generated. See the following [PR](https://github.com/pytorch/pytorch.github.io/pull/1467) as reference.
The PR to update published_versions.json and quick-start-module.js is auto generated. See following [PR](https://github.com/pytorch/pytorch.github.io/pull/1467) as reference.
Please note: This PR needs to be merged on the release day and hence it should be absolutely free of any failures. To test this PR, open another test PR pointing to the Release Candidate location as described in the [Release Candidate Storage](#release-candidate-storage) section.
Please note: This PR needs to be merged on the release day and hence it should be absolutely free of any failures. To test this PR, open another test PR pointing to the Release Candidate location described in the [Release Candidate Storage](RELEASE.md#release-candidate-storage) section above.
### Open Google Colab issue
This is normally done right after the release is completed. We need to create a Google Colab issue. See the following example [issue](https://github.com/googlecolab/colabtools/issues/2372)
This is normally done right after the release is completed. We need to create a Google Colab issue; see the following example [issue](https://github.com/googlecolab/colabtools/issues/2372).
# Patch Releases
A patch release is a maintenance release of PyTorch that includes fixes for regressions found in a previous minor release. Patch releases typically will bump the `patch` version from semver (i.e. `[major].[minor].[patch]`).
Please note: Starting from 2.1, one can expect up to 2 patch releases after every minor release. Patch releases are only published for the latest minor release.
Please note: Starting from 2.1, one can expect up to 2 patch releases after every minor release. Patch releases are only published for the latest minor release.
## Patch Release Criteria
@ -363,29 +363,29 @@ Patch releases should be considered if a regression meets the following criteria
> Main POC: Patch Release Managers, Triage Reviewers
Patch releases should follow these high-level phases. This process starts immediately after the previous release has completed.
The patch release process takes around 4-5 weeks to complete.
Patch release process takes around 4-5 weeks to complete.
1. Triage is a process where issues are identified, graded, compared to Patch Release Criteria and added to Patch Release milestone. This process normally takes 2 weeks after the release completion.
1. Triage is a process where issues are identified, graded, compared to the Patch Release Criteria and added to the Patch Release milestone. This process normally takes 2 weeks after the release completion.
2. Go/No Go meeting between PyTorch Releng, PyTorch Core and Project Managers, where potential issues in the milestones that could trigger a release are reviewed and the following decisions are made:
* Should the new patch release be created?
* Should the new patch Release be created ?
* Timeline execution for the patch release
3. Cherry picking phase starts after the decision is made to create a patch release. At this point, a new release tracker for the patch release is created, and an announcement will be made on official channels [example announcement](https://dev-discuss.pytorch.org/t/pytorch-release-2-0-1-important-information/1176). The authors of the fixes to regressions will be asked to create their own cherry picks. This process normally takes 2 weeks.
4. Building Binaries, Promotion to Stable and testing. After all cherry picks have been merged, Release Managers trigger a new build and produce a new release candidate. An announcement is made on the official channel about the RC availability at this point. This process normally takes 2 weeks.
3. Cherry picking phase starts after the decision is made to create a patch release. At this point, a new release tracker for the patch release is created, and an announcement will be made on official channels [example announcement](https://dev-discuss.pytorch.org/t/pytorch-release-2-0-1-important-information/1176). The authors of the fixes to regressions will be asked to create their own cherry picks. This process normally takes 2 weeks.
4. Building Binaries, Promotion to Stable and testing. After all cherry picks have been merged, Release Managers trigger a new build and produce a new release candidate. An announcement is made on the official channel about the RC availability at this point. This process normally takes 2 weeks.
5. General Availability
### Triage
> Main POC: Triage Reviewers
1. Tag issues/pull requests that are candidates for a potential patch release with `triage review`
1. Tag issues / pull requests that are candidates for a potential patch release with `triage review`
* ![adding triage review label](https://user-images.githubusercontent.com/1700823/132589089-a9210a14-6159-409d-95e5-f79067f6fa38.png)
2. Triage reviewers will then check if the regression/fix identified fits within the above mentioned [Patch Release Criteria](#patch-release-criteria)
3. Triage reviewers will then add the issue/pull request to the related milestone (i.e. `1.9.1`) if the regression is found to be within the [Patch Release Criteria](#patch-release-criteria)
2. Triage reviewers will then check if the regression/fix identified fits within the above-mentioned [Patch Release Criteria](#patch-release-criteria)
3. Triage reviewers will then add the issue/pull request to the related milestone (i.e. `1.9.1`) if the regression is found to be within the [Patch Release Criteria](#patch-release-criteria)
* ![adding to milestone](https://user-images.githubusercontent.com/1700823/131175980-148ff38d-44c3-4611-8a1f-cd2fd1f4c49d.png)
### Issue Tracker for Patch releases
For patch releases, an issue tracker needs to be created. For a patch release, we require all cherry-pick changes to have links to either a high-priority GitHub issue or a CI failure from previous RC. An example of this would look like:
For patch releases, an issue tracker needs to be created. For a patch release, we require all cherry-pick changes to have links to either a high-priority GitHub issue or a CI failure from the previous RC. An example of this would look like:
* https://github.com/pytorch/pytorch/issues/128436
Only the following issues are accepted:

View File

@ -343,32 +343,9 @@ if(USE_CUDA)
endif()
if(USE_ROCM)
# NOTE: The PyTorch build does not actually add_subdirectory
# third_party/composable_kernel or use it as a CMake library. What is used
# is header only, so this should be ok, except that the CMake build generates
# a ck/config.h. We just do that part here. Without this, the ck.h from the
# ROCM SDK may get accidentally used instead.
# Generates composable_kernel's ck/config.h into the current binary directory
# from the config.h.in template shipped under third_party/composable_kernel.
# Takes no arguments. Every CK_ENABLE_* / CK_USE_* switch below is set to "ON"
# in this function's scope just before the template is configured.
function(_pytorch_rocm_generate_ck_conf)
  # Data-type switches: CK_ENABLE_INT8, CK_ENABLE_FP16, ... CK_ENABLE_BF8.
  foreach(_ck_dtype IN ITEMS INT8 FP16 FP32 FP64 BF16 FP8 BF8)
    set(CK_ENABLE_${_ck_dtype} "ON")
  endforeach()

  # Instruction-path switches: CK_USE_XDL and CK_USE_WMMA.
  foreach(_ck_path IN ITEMS XDL WMMA)
    set(CK_USE_${_ck_path} "ON")
  endforeach()

  # NOTE(review): presumably config.h.in consumes the variables set above;
  # the template itself is not visible here -- confirm against the CK sources.
  configure_file(
    "${Torch_SOURCE_DIR}/third_party/composable_kernel/include/ck/config.h.in"
    "${CMAKE_CURRENT_BINARY_DIR}/composable_kernel/ck/config.h"
  )
endfunction()
list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/hip)
list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/composable_kernel/include)
list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/composable_kernel/library/include)
list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_BINARY_DIR}/composable_kernel)
_pytorch_rocm_generate_ck_conf()
# Next two lines are needed because TunableOp uses third-party/fmt
list(APPEND ATen_HIP_INCLUDE $<TARGET_PROPERTY:fmt::fmt-header-only,INTERFACE_INCLUDE_DIRECTORIES>)
list(APPEND ATen_HIP_DEPENDENCY_LIBS fmt::fmt-header-only)

View File

@ -110,11 +110,6 @@ class TORCH_API Context {
// Returns the pinned-memory allocator exposed by the accelerator hooks.
//
// device_type: optional device type to target. When it is not provided, the
// device to initialize is taken from at::getAccelerator(), which may itself
// be empty.
//
// If a device type could be determined, it is lazily initialized before the
// allocator is fetched. Note that the hooks lookup is always given the
// caller's original `device_type` argument, not the resolved fallback.
Allocator* getPinnedMemoryAllocator(
    std::optional<c10::DeviceType> device_type = std::nullopt) {
  std::optional<c10::DeviceType> device_to_init = device_type;
  if (!device_to_init.has_value()) {
    device_to_init = at::getAccelerator();
  }
  if (device_to_init.has_value()) {
    lazyInitDevice(*device_to_init);
  }
  return getAcceleratorHooksInterface(device_type).getPinnedMemoryAllocator();
}

View File

@ -28,8 +28,10 @@ c10::Allocator* GetCPUAllocatorMaybePinned(bool pin_memory) {
opt_device_type = at::getAccelerator(false);
}
if (opt_device_type.has_value()) {
return at::globalContext().getPinnedMemoryAllocator(
opt_device_type.value());
at::globalContext().lazyInitDevice(opt_device_type.value());
return at::globalContext()
.getAcceleratorHooksInterface(opt_device_type)
.getPinnedMemoryAllocator();
} else {
TORCH_CHECK(
false, "Need to provide pin_memory allocator to use pin memory.")

View File

@ -64,7 +64,7 @@ thread_local std::array<at::ScalarType, at::COMPILE_TIME_MAX_DEVICE_TYPES>
at::ScalarType::Undefined, // IDEEP.
at::kHalf, // AMD HIP
at::ScalarType::Undefined, // FPGA
at::kBFloat16, // ONNX Runtime / Microsoft
at::ScalarType::Undefined, // ONNX Runtime / Microsoft
at::kBFloat16, // XLA / TPU
at::ScalarType::Undefined, // Vulkan
at::ScalarType::Undefined, // Metal
@ -500,44 +500,6 @@ TORCH_LIBRARY_IMPL(aten, AutocastMTIA, m) {
TORCH_FN((&at::autocast::binary_cross_entropy_banned)));
}
// MAIA
// Fallback registration for the AutocastMAIA dispatch key: the fallback
// kernel is a fallthrough, so ops without an explicit AutocastMAIA kernel
// are not handled at this key.
TORCH_LIBRARY_IMPL(_, AutocastMAIA, m) {
m.fallback(torch::CppFunction::makeFallthrough());
}
// Registers the AutocastMAIA kernels for the `aten` namespace.
// Each section below defines a small adapter macro that appends a cast policy
// to KERNEL_MAIA and then expands it over the matching AT_FORALL_* op list.
TORCH_LIBRARY_IMPL(aten, AutocastMAIA, m) {
// lower_precision_fp
#define _KERNEL_MAIA_LOW_PRECISION_FP(...) \
KERNEL_MAIA(__VA_ARGS__, lower_precision_fp)
AT_FORALL_LOWER_PRECISION_FP(_KERNEL_MAIA_LOW_PRECISION_FP)
// fp32
#define _KERNEL_MAIA_FP32(...) KERNEL_MAIA(__VA_ARGS__, fp32)
AT_FORALL_FP32(_KERNEL_MAIA_FP32)
// fp32_set_opt_dtype
#define _KERNEL_MAIA_FP32_SET_OPT_DTYPE(...) \
KERNEL_MAIA(__VA_ARGS__, fp32_set_opt_dtype)
AT_FORALL_FP32_SET_OPT_DTYPE(_KERNEL_MAIA_FP32_SET_OPT_DTYPE)
// fp32_append_dtype
// The fp32_append_dtype wrapper overrides implicit promotion behavior.
// norm does not implicitly promote, but be aware when adding new ops to this policy.
AT_FORALL_DIFFERENT_REDISPATCH_SIGNATURE(
KERNEL_DIFFERENT_REDISPATCH_SIGNATURE_MAIA)
// promote
#define _KERNEL_MAIA_PROMOTE(...) KERNEL_MAIA(__VA_ARGS__, promote)
AT_FORALL_PROMOTE(_KERNEL_MAIA_PROMOTE)
// binary_cross_entropy is registered by hand rather than through a policy
// list: it is bound to at::autocast::binary_cross_entropy_banned.
// NOTE(review): presumably that kernel rejects the op under autocast --
// confirm against its definition.
m.impl(TORCH_SELECTIVE_NAME("aten::binary_cross_entropy"),
TORCH_FN((&at::autocast::binary_cross_entropy_banned)));
}
// XPU
TORCH_LIBRARY_IMPL(_, AutocastXPU, m) {
m.fallback(torch::CppFunction::makeFallthrough());

View File

@ -126,11 +126,10 @@ TORCH_API inline void set_autocast_gpu_dtype(at::ScalarType dtype) {
// NOLINTNEXTLINE(misc-use-internal-linkage)
AT_FORALL_DEPRECATED_AUTOCAST_BACKENDS(DECLARE_DEPRECATED_AUTOCAST_APIS)
const std::array<at::DeviceType, 10> _AUTOCAST_SUPPORTED_DEVICES{
const std::array<at::DeviceType, 9> _AUTOCAST_SUPPORTED_DEVICES{
at::kCPU,
at::kCUDA,
at::kMTIA,
at::kMAIA,
at::kXPU,
at::kIPU,
at::kHPU,
@ -151,8 +150,6 @@ inline bool is_autocast_eligible(
tensor.is_floating_point();
case c10::DeviceType::MTIA:
return tensor.is_mtia() && tensor.is_floating_point();
case c10::DeviceType::MAIA:
return tensor.is_maia() && tensor.is_floating_point();
case c10::DeviceType::XPU:
return tensor.is_xpu() && tensor.is_floating_point();
case c10::DeviceType::IPU:
@ -180,8 +177,6 @@ inline DispatchKey get_autocast_dispatch_key_from_device_type(
return DispatchKey::AutocastCPU;
case c10::DeviceType::MTIA:
return DispatchKey::AutocastMTIA;
case c10::DeviceType::MAIA:
return DispatchKey::AutocastMAIA;
case c10::DeviceType::XPU:
return DispatchKey::AutocastXPU;
case c10::DeviceType::IPU:
@ -753,24 +748,6 @@ copy pasted in from VariableTypeEverything.cpp with appropriate substitutions.
REDISPATCH_SIGNATURE, \
POLICY)
// KERNEL_MAIA/KERNEL_DIFFERENT_REDISPATCH_SIGNATURE_MAIA
// registration (OP, POLICY) or (OP, OVERLOAD, POLICY) for AutocastMAIA
//
// Both macros are thin forwarders: they prepend c10::DeviceType::MAIA to
// their arguments and delegate to the device-generic KERNEL and
// KERNEL_DIFFERENT_REDISPATCH_SIGNATURE macros respectively.
#define KERNEL_MAIA(...) KERNEL(c10::DeviceType::MAIA, __VA_ARGS__)
#define KERNEL_DIFFERENT_REDISPATCH_SIGNATURE_MAIA( \
REDISPATCH_FUNC, \
REGISTER_NAME, \
REGISTER_SIGNATURE, \
REDISPATCH_SIGNATURE, \
POLICY) \
KERNEL_DIFFERENT_REDISPATCH_SIGNATURE( \
c10::DeviceType::MAIA, \
REDISPATCH_FUNC, \
REGISTER_NAME, \
REGISTER_SIGNATURE, \
REDISPATCH_SIGNATURE, \
POLICY)
// KERNEL_XPU/KERNEL_DIFFERENT_REDISPATCH_SIGNATURE_XPU
// registration (OP, POLICY) or (OP, OVERLOAD, POLICY) for AutocastXPU
#define KERNEL_XPU(...) KERNEL(c10::DeviceType::XPU, __VA_ARGS__)

View File

@ -80,10 +80,6 @@ TORCH_LIBRARY_IMPL(_, AutogradMTIA, m) {
m.fallback(AUTOGRAD_FALLBACK);
}
// Installs AUTOGRAD_FALLBACK as the fallback kernel for the AutogradMAIA
// dispatch key, matching the registrations for the other Autograd* keys in
// this file.
TORCH_LIBRARY_IMPL(_, AutogradMAIA, m) {
m.fallback(AUTOGRAD_FALLBACK);
}
TORCH_LIBRARY_IMPL(_, AutogradXLA, m) {
m.fallback(AUTOGRAD_FALLBACK);
}

View File

@ -1079,13 +1079,7 @@ void gemm_internal<float>(CUDABLAS_GEMM_ARGTYPES(float))
}
#ifdef USE_ROCM
else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) {
auto dprops = at::cuda::getCurrentDeviceProperties();
c10::string_view arch(dprops->gcnArchName);
if (arch == "gfx1100") { //no CK GEMM version for gfx1100
gemm_internal_cublaslt<float>(CUDABLAS_GEMM_ARGS(float));
} else{
at::native::gemm_internal_ck<float>(CUDABLAS_GEMM_ARGS(float));
}
at::native::gemm_internal_ck<float>(CUDABLAS_GEMM_ARGS(float));
}
#endif
else {

View File

@ -156,7 +156,6 @@ NVRTC_STUB2(nvrtcGetProgramLogSize,nvrtcProgram, size_t*)
NVRTC_STUB2(nvrtcGetProgramLog, nvrtcProgram, char *)
NVRTC_STUB3(nvrtcGetLoweredName, nvrtcProgram, const char *, const char **)
CUDA_STUB2(cuModuleLoad, CUmodule*, const char*)
CUDA_STUB2(cuModuleLoadData, CUmodule *, const void *)
CUDA_STUB3(cuModuleGetFunction, CUfunction *, CUmodule, const char *)
CUDA_STUB4(cuOccupancyMaxActiveBlocksPerMultiprocessor, int *, CUfunction, int, size_t)
@ -170,8 +169,6 @@ CUDA_STUB4(cuLinkCreate, unsigned int, CUjit_option *, void **, CUlinkState *)
CUDA_STUB3(cuLinkComplete, CUlinkState, void **, size_t *)
CUDA_STUB3(cuFuncSetAttribute, CUfunction, CUfunction_attribute, int)
CUDA_STUB3(cuFuncGetAttribute, int*, CUfunction_attribute, CUfunction)
CUDA_STUB3(cuPointerGetAttribute, void*, CUpointer_attribute, CUdeviceptr)
#if defined(CUDA_VERSION) && CUDA_VERSION >= 12000
CUresult CUDAAPI

View File

@ -43,7 +43,6 @@ namespace at::cuda {
_(nvrtcGetProgramLogSize) \
_(nvrtcGetProgramLog) \
_(nvrtcGetLoweredName) \
_(cuModuleLoad) \
_(cuModuleLoadData) \
_(cuModuleLoadDataEx) \
_(cuModuleGetFunction) \
@ -61,7 +60,6 @@ namespace at::cuda {
_(cuLinkComplete) \
_(cuFuncSetAttribute) \
_(cuFuncGetAttribute) \
_(cuPointerGetAttribute) \
#if defined(CUDA_VERSION) && CUDA_VERSION >= 12000
#define AT_FORALL_NVRTC_EXTENDED(_) \

View File

@ -575,20 +575,11 @@ struct ScaledGemmParams : OpParams {
std::string BLASSignature() const override {
// Excluding use_fast_accum and use_rowise booleans for now
if (bias_ptr == nullptr) {
return fmt::sprintf("- { function: matmul, M: %ld, N: %ld, K: %ld, lda: %ld, ldb: %ld, ldc: %ld, ldd: %ld, stride_a: 0, stride_b: 0, stride_c: 0, stride_d: 0, "
"transA: %c, transB: %c, batch_count: 1, scaleA: f32_r, scaleB: f32_r, a_type: %s, b_type: %s, c_type: %s, d_type: %s, scale_type: %s, compute_type: %s }",
m, n, k, lda, ldb, ldc, ldc, transa, transb,
ScalarTypeToBLASType(a_dtype), ScalarTypeToBLASType(b_dtype), ScalarTypeToBLASType(c_dtype), ScalarTypeToBLASType(c_dtype),
ComputeTypeFor<T>(), ComputeTypeFor<T>());
}
else {
return fmt::sprintf("- { function: matmul, M: %ld, N: %ld, K: %ld, lda: %ld, ldb: %ld, ldc: %ld, ldd: %ld, stride_a: 0, stride_b: 0, stride_c: 0, stride_d: 0, "
"transA: %c, transB: %c, batch_count: 1, scaleA: f32_r, scaleB: f32_r, a_type: %s, b_type: %s, c_type: %s, d_type: %s, bias_type: %s, scale_type: %s, compute_type: %s }",
m, n, k, lda, ldb, ldc, ldc, transa, transb,
ScalarTypeToBLASType(a_dtype), ScalarTypeToBLASType(b_dtype), ScalarTypeToBLASType(c_dtype), ScalarTypeToBLASType(c_dtype), ScalarTypeToBLASType(bias_dtype),
ComputeTypeFor<T>(), ComputeTypeFor<T>());
}
return fmt::sprintf("- { function: matmul, M: %ld, N: %ld, K: %ld, lda: %ld, ldb: %ld, ldc: %ld, ldd: %ld, stride_a: 0, stride_b: 0, stride_c: 0, stride_d: 0, "
"transA: %c, transB: %c, batch_count: 1, scaleA: f32_r, scaleB: f32_r, a_type: %s, b_type: %s, c_type: %s, d_type: %s, bias_type: %s, scale_type: %s, compute_type: %s }",
m, n, k, lda, ldb, ldc, ldc, transa, transb,
ScalarTypeToBLASType(a_dtype), ScalarTypeToBLASType(b_dtype), ScalarTypeToBLASType(c_dtype), ScalarTypeToBLASType(c_dtype), ScalarTypeToBLASType(bias_dtype),
ComputeTypeFor<T>(), ComputeTypeFor<T>());
}
std::string Signature() const override {

View File

@ -498,11 +498,7 @@ class HipblasltGemmOp : public Callable<ParamsT> {
mat_c, HIPBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stride_c, sizeof(stride_c)));
}
hipblasComputeType_t computeType = HIPBLAS_COMPUTE_32F;
if (at::globalContext().allowTF32CuBLAS()) {
computeType = HIPBLAS_COMPUTE_32F_FAST_TF32;
}
HipBlasLtMatmulDescriptor matmul(computeType, HIP_R_32F);
HipBlasLtMatmulDescriptor matmul(HIPBLAS_COMPUTE_32F, HIP_R_32F);
matmul.setAttribute(HIPBLASLT_MATMUL_DESC_TRANSA, opa);
matmul.setAttribute(HIPBLASLT_MATMUL_DESC_TRANSB, opb);
@ -615,11 +611,6 @@ auto GetHipBlasLtTypeStringAndOps() {
auto in_out_datatype = HipDataTypeFor<CT>();
std::vector<hipblasLtMatmulHeuristicResult_t> heuristic_result;
hipblasComputeType_t computeType = HIPBLAS_COMPUTE_32F;
if (at::globalContext().allowTF32CuBLAS()) {
computeType = HIPBLAS_COMPUTE_32F_FAST_TF32;
}
hipblasLtHandle_t handle;
TORCH_HIPBLASLT_CHECK(hipblasLtCreate(&handle));
TORCH_HIPBLASLT_CHECK(hipblaslt_ext::getAllAlgos(handle,
@ -630,7 +621,7 @@ auto GetHipBlasLtTypeStringAndOps() {
b_datatype,
in_out_datatype,
in_out_datatype,
computeType,
HIPBLAS_COMPUTE_32F,
heuristic_result));
TORCH_HIPBLASLT_CHECK(hipblasLtDestroy(handle));

View File

@ -141,8 +141,6 @@ class RocblasGemmOp : public Callable<GemmParams<T>> {
TuningStatus Call(const GemmParams<T>* params) override {
auto input_output_type = RocBlasDataTypeFor<T>();
if (at::globalContext().allowTF32CuBLAS() && input_output_type == rocblas_datatype_f32_r)
return FAIL; // no support for TF32 in rocBLAS
auto compute_type = RocBlasComputeTypeFor<T>();
auto h_a = DoCastForHalfOrBfloat16(params->alpha);
auto h_b = DoCastForHalfOrBfloat16(params->beta);
@ -209,8 +207,6 @@ class RocblasGemmStridedBatchedOp : public Callable<GemmStridedBatchedParams<T>>
TuningStatus Call(const GemmStridedBatchedParams<T>* params) override {
auto input_output_type = RocBlasDataTypeFor<T>();
if (at::globalContext().allowTF32CuBLAS() && input_output_type == rocblas_datatype_f32_r)
return FAIL; // no support for TF32 in rocBLAS
auto compute_type = RocBlasComputeTypeFor<T>();
auto h_a = DoCastForHalfOrBfloat16(params->alpha);
auto h_b = DoCastForHalfOrBfloat16(params->beta);

View File

@ -322,24 +322,6 @@ void gemm(
const float beta,
at::BFloat16 *c, int64_t ldc) {
internal::normalize_last_dims(transa, transb, m, n, k, &lda, &ldb, &ldc);
#if AT_MKLDNN_ENABLED()
#ifdef __aarch64__
// MKLDNN also supports ARM for bf16, and the bypass is only
// currently intended for x86/x86_64.
const bool use_bf16_gemv_trans = false;
#elif defined(__powerpc__)
const bool use_bf16_gemv_trans = false;
#else
const bool bf16_gemv_trans_would_be_faster = cpuinfo_initialize() &&
!cpuinfo_has_x86_avx512bf16();
const bool use_bf16_gemv_trans = bf16_gemv_trans_would_be_faster &&
transa == TransposeType::Transpose &&
transb == TransposeType::NoTranspose && n == 1 && alpha == 1.0;
#endif
if (!use_bf16_gemv_trans && mkldnn_bf16_gemm(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc)) {
return;
}
#endif
#if AT_BUILD_WITH_BLAS() && defined(BLAS_HAS_SBGEMM)
if (use_blas_gemm(transa, transb, m, n, k, lda, ldb, ldc)) {
int m_ = m, n_ = n, k_ = k, lda_ = lda, ldb_ = ldb, ldc_ = ldc;
@ -360,6 +342,24 @@ void gemm(
}
return;
}
#endif
#if AT_MKLDNN_ENABLED()
#ifdef __aarch64__
// MKLDNN also supports ARM for bf16, and the bypass is only
// currently intended for x86/x86_64.
const bool use_bf16_gemv_trans = false;
#elif defined(__powerpc__)
const bool use_bf16_gemv_trans = false;
#else
const bool bf16_gemv_trans_would_be_faster = cpuinfo_initialize() &&
!cpuinfo_has_x86_avx512bf16();
const bool use_bf16_gemv_trans = bf16_gemv_trans_would_be_faster &&
transa == TransposeType::Transpose &&
transb == TransposeType::NoTranspose && n == 1 && alpha == 1.0;
#endif
if (!use_bf16_gemv_trans && mkldnn_bf16_gemm(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc)) {
return;
}
#endif
gemm_stub(
at::kCPU, at::kBFloat16,

View File

@ -3610,11 +3610,11 @@ Tensor& transpose_(Tensor& self, int64_t dim0, int64_t dim1) {
return at::_mkldnn_transpose_(self, dim0, dim1);
}
SymDimVector sizes(self.sym_sizes().begin(), self.sym_sizes().end());
std::swap(sizes[dim0], sizes[dim1]);
SymDimVector strides(self.sym_strides().begin(), self.sym_strides().end());
DimVector sizes(self.sizes().begin(), self.sizes().end());
DimVector strides(self.strides().begin(), self.strides().end());
std::swap(strides[dim0], strides[dim1]);
auto result = self.as_strided__symint(std::move(sizes), std::move(strides));
std::swap(sizes[dim0], sizes[dim1]);
self.as_strided_(sizes, strides);
return self;
}

View File

@ -832,9 +832,9 @@ void hardswish_backward_kernel(TensorIterator& iter) {
cpu_kernel_vec(
iter,
[&](scalar_t grad_val, scalar_t self_val) -> scalar_t {
if (float(self_val) <= neg_three) {
if (float(self_val) < neg_three) {
return zero;
} else if (float(self_val) < three) {
} else if (float(self_val) <= three) {
return float(grad_val) * ((float(self_val) / three) + one_half);
} else {
return grad_val;
@ -847,19 +847,19 @@ void hardswish_backward_kernel(TensorIterator& iter) {
Vec::blendv(
grad_val0 * ((self_val0 / kThreeVec) + kOneHalfVec),
grad_val0,
self_val0 >= kThreeVec
self_val0 > kThreeVec
),
kZeroVec,
self_val0 <= kNegThreeVec
self_val0 < kNegThreeVec
);
self_val1 = Vec::blendv(
Vec::blendv(
grad_val1 * ((self_val1 / kThreeVec) + kOneHalfVec),
grad_val1,
self_val1 >= kThreeVec
self_val1 > kThreeVec
),
kZeroVec,
self_val1 <= kNegThreeVec
self_val1 < kNegThreeVec
);
return convert_from_float<scalar_t>(self_val0, self_val1);
});
@ -878,9 +878,9 @@ void hardswish_backward_kernel(TensorIterator& iter) {
cpu_kernel_vec(
iter,
[&](scalar_t grad_val, scalar_t self_val) {
if (self_val <= neg_three) {
if (self_val < neg_three) {
return zero;
} else if (self_val < three) {
} else if (self_val <= three) {
return grad_val * ((self_val / three) + one_half);
} else {
return grad_val;
@ -891,10 +891,10 @@ void hardswish_backward_kernel(TensorIterator& iter) {
Vec::blendv(
grad_val * ((self_val / kThreeVec) + kOneHalfVec),
grad_val,
self_val >= kThreeVec
self_val > kThreeVec
),
kZeroVec,
self_val <= kNegThreeVec
self_val < kNegThreeVec
);
}
);

View File

@ -1,12 +1,5 @@
#pragma once
// On Windows, math.h needs to be included with _USE_MATH_DEFINES defined to
// access constants such as M_SQRT2 and M_2_SQRTPI.
#ifdef _WIN32
#define _USE_MATH_DEFINES
#include <cmath>
#endif // _WIN32
#include <ATen/cpu/vec/vec.h>
#include <c10/util/BFloat16.h> // For c10::is_reduced_floating_point_v.

View File

@ -45,9 +45,9 @@ void hardswish_backward_kernel(TensorIterator& iter) {
[zero, three, neg_three, one_half]GPU_LAMBDA(scalar_t grad_val_, scalar_t self_val_) -> scalar_t {
opmath_t grad_val = static_cast<opmath_t>(grad_val_);
opmath_t self_val = static_cast<opmath_t>(self_val_);
if (self_val <= neg_three) {
if (self_val < neg_three) {
return zero;
} else if (self_val < three) {
} else if (self_val <= three) {
return grad_val * ((self_val / three) + one_half);
} else {
return grad_val;

View File

@ -51,23 +51,6 @@
namespace at::native {
#ifdef USE_ROCM
// Custom configuration for vectorized elementwise kernel
// with template instantiation.
namespace vectorized_templated_config {
constexpr int num_threads() {
return 512;
}
constexpr int elems_per_thread() {
return 32;
}
constexpr int block_work_size() {
return elems_per_thread() * num_threads();
}
} // namespace vectorized_templated_config
#endif
template <typename args_t, size_t... Is>
constexpr auto sum_of_sizes(args_t args, std::index_sequence<Is...>) {
@ -272,139 +255,6 @@ static inline void launch_vectorized_kernel(
}
}
#ifdef USE_ROCM
template <
int vec_size,
typename func_t,
typename array_t,
typename inp_calc_t,
typename out_calc_t,
typename loader_t,
typename storer_t,
typename OutputType,
typename... InputTypes>
C10_LAUNCH_BOUNDS_1(vectorized_templated_config::num_threads())
__global__ void vectorized_templated_elementwise_kernel(
int N,
func_t f,
array_t data,
inp_calc_t inp_calc,
out_calc_t out_calc,
loader_t loader,
storer_t storer) {
int remaining =
N - vectorized_templated_config::block_work_size() * blockIdx.x;
if (remaining <
vectorized_templated_config::block_work_size()) { // if this block handles
// the reminder,
// just do a naive unrolled loop
auto policy = memory::policies::unroll_base<
vectorized_templated_config::num_threads(),
array_t,
inp_calc_t,
out_calc_t,
loader_t,
storer_t,
vectorized_templated_config::elems_per_thread()>(
data, remaining, inp_calc, out_calc, loader, storer);
elementwise_kernel_helper(f, policy);
} else { // if this block has a full `block_work_size` data to handle, use
// vectorized memory access
elementwise_kernel_helper(
f,
memory::policies::vectorized_templated<
vec_size,
array_t,
vectorized_templated_config::elems_per_thread(),
vectorized_templated_config::num_threads(),
OutputType,
InputTypes...>(data));
}
}
// This function assume trivial 1d and supports template specialization
// to avoid dynamic casting.
// Input vectorization size is based on runtime information, i.e.
// the actual data types of the input and output tensor and cannot
// be determined using the functor type, as in regular non-templated
// vectorized kernels. The caller is in charge of selecting the correct input
// vectorization length.
template <
typename func_t,
typename array_t,
typename inp_calc_t,
typename out_calc_t,
typename loader_t,
typename storer_t,
typename OutputType,
typename... InputTypes>
static inline void launch_vectorized_templated_kernel(
int64_t N,
const func_t& f,
array_t data,
inp_calc_t ic,
out_calc_t oc,
loader_t l,
storer_t s) {
TORCH_INTERNAL_ASSERT(N > 0 && N <= std::numeric_limits<int32_t>::max());
using traits = function_traits<func_t>;
int64_t grid = (N + vectorized_templated_config::block_work_size() - 1) /
vectorized_templated_config::block_work_size();
auto stream = at::cuda::getCurrentCUDAStream();
int vec_size = memory::can_vectorize_up_to<func_t>(data);
switch (vec_size) {
case 8:
vectorized_templated_elementwise_kernel<
8,
func_t,
array_t,
inp_calc_t,
out_calc_t,
loader_t,
storer_t,
OutputType,
InputTypes...>
<<<grid, vectorized_templated_config::num_threads(), 0, stream>>>(
N, f, data, ic, oc, l, s);
C10_CUDA_KERNEL_LAUNCH_CHECK();
break;
case 4:
vectorized_templated_elementwise_kernel<
4,
func_t,
array_t,
inp_calc_t,
out_calc_t,
loader_t,
storer_t,
OutputType,
InputTypes...>
<<<grid, vectorized_templated_config::num_threads(), 0, stream>>>(
N, f, data, ic, oc, l, s);
C10_CUDA_KERNEL_LAUNCH_CHECK();
break;
case 2:
vectorized_templated_elementwise_kernel<
2,
func_t,
array_t,
inp_calc_t,
out_calc_t,
loader_t,
storer_t,
OutputType,
InputTypes...>
<<<grid, vectorized_templated_config::num_threads(), 0, stream>>>(
N, f, data, ic, oc, l, s);
C10_CUDA_KERNEL_LAUNCH_CHECK();
break;
default:
// vector size 1 is not handled as part of vectorize_templated kernel
TORCH_INTERNAL_ASSERT(false, "Unexpected vectorization size");
}
}
#endif
template <
typename func_t,
typename array_t,
@ -542,46 +392,6 @@ void gpu_kernel_impl_nocast(TensorIteratorBase& iter, const func_t& f) {
});
}
#ifdef USE_ROCM
namespace {
template <typename TupleLike, size_t arity, size_t arg_num = 0>
struct check_types {
constexpr static inline bool check() {
if constexpr (arity != 2)
return false;
if constexpr (arg_num == 0) {
using SelectedType = std::tuple_element_t<arg_num, TupleLike>;
if constexpr (std::is_same_v<float, SelectedType>)
return check_types<TupleLike, arity, arg_num + 1>::check();
} else if constexpr (arg_num == 1) {
using SelectedType2 = std::tuple_element_t<arg_num, TupleLike>;
if constexpr (std::is_same_v<float, SelectedType2>)
return check_types<TupleLike, arity, arg_num + 1>::check();
}
return false;
}
};
// Bottom case: if we got this far, assume correct type matching except
// when there are no arguments (arity == 0).
template <typename TupleLike, size_t arity>
struct check_types<TupleLike, arity, arity> {
constexpr static inline bool check() {
if constexpr (arity != 0)
return true;
return false;
}
};
template <typename TupleLike>
struct check_types<TupleLike, 0, 0> {
constexpr static inline bool check() {
return false;
}
};
} // namespace
#endif
template <typename func_t>
void gpu_kernel_impl(TensorIteratorBase& iter, const func_t& f) {
if (!needs_dynamic_casting<func_t>::check(iter)) {
@ -606,45 +416,6 @@ void gpu_kernel_impl(TensorIteratorBase& iter, const func_t& f) {
if (contiguous) {
#ifdef USE_ROCM
// Attempt to call specialized vectorized elementwise kernel
// that enables interleaving.
using float_map = c10::CppTypeToScalarType<float>;
using bfloat16_map = c10::CppTypeToScalarType<BFloat16>;
if (iter.ninputs() == 2 && iter.input_dtype(0) == float_map::value &&
iter.input_dtype(1) == bfloat16_map::value &&
memory::can_vectorize_up_to<func_t>(data) > 1) {
// constexpr to reduce the amount of kernels (empty) generated for
// vectorized templated elementwise and limit which functors are actually
// applied to the load and store at compile time.
using func_tuple = typename traits::ArgsTuple;
if constexpr (
std::is_same_v<float, arg0_t> && traits::arity == 2 &&
check_types<func_tuple, traits::arity, 0>::check()) {
auto input_offset_calculator = TrivialOffsetCalculator<traits::arity>();
auto output_offset_calculator = TrivialOffsetCalculator<1>();
auto loader = memory::LoadWithCast<traits::arity>(iter);
auto storer = memory::StoreWithCast<1>(iter);
launch_vectorized_templated_kernel<
func_t,
std::array<char*, ntensors>,
decltype(input_offset_calculator),
decltype(output_offset_calculator),
decltype(loader),
decltype(storer),
float,
float,
BFloat16>(
numel,
f,
data,
input_offset_calculator,
output_offset_calculator,
loader,
storer);
return;
}
}
std::array<ScalarType, ntensors> dtypes;
auto inner_strides = iter.get_inner_strides();
std::array<int, ntensors> strides;

View File

@ -67,28 +67,6 @@ struct vectorized_load_helper {
}
};
#ifdef USE_ROCM
// Templated version of vectorized load helper.
// It can be used on heterogeneous input tensor element types.
template <int arg_index>
struct vectorized_templated_load_helper {
template <typename args_t, typename policy_t>
static __device__ void apply(policy_t& self, args_t* args, int idx) {
using arg_t = std::tuple_element_t<arg_index, args_t>;
// `data` hold the data_ptr for tensors [output, input0, input1, ...], so we
// need a +1 offset to get the input
// Delay pointer arithmetic to the policy loader where we know the actual
// type of the current argument.
char* ptr = (self.data[arg_index + 1]);
auto args_accessor = [&args] __device__(int thread_unroll_idx) -> arg_t& {
return std::get<arg_index>(args[thread_unroll_idx]);
};
self.template load_single_arg<arg_index>(args_accessor, ptr, idx);
}
};
#endif
template<int arg_index>
struct unroll_load_helper {
template <typename args_t, typename policy_t, typename offset_t, typename loader_t>
@ -203,16 +181,9 @@ __device__ aligned_vector<bool, vec_size> load_vector(const bool *base_ptr, uint
namespace policies {
template <
int num_threads,
typename data_t,
typename inp_calc_t,
typename out_calc_t,
typename loader_t,
typename storer_t,
int elems_per_thread,
int num_outputs = 1>
struct unroll_base {
template<typename data_t, typename inp_calc_t, typename out_calc_t, typename loader_t, typename storer_t, int elems_per_thread, int num_outputs=1>
struct unroll {
data_t data;
int remaining;
inp_calc_t input_offset_calculator;
@ -220,24 +191,12 @@ struct unroll_base {
loader_t loader;
storer_t storer;
static constexpr int tws = elems_per_thread;
static constexpr int block_work_size = elems_per_thread * num_threads;
__device__ unroll_base(
data_t data,
int remaining,
inp_calc_t ic,
out_calc_t oc,
loader_t l,
storer_t s)
: data(data),
remaining(remaining),
input_offset_calculator(ic),
output_offset_calculator(oc),
loader(l),
storer(s) {}
__device__ unroll(data_t data, int remaining, inp_calc_t ic, out_calc_t oc, loader_t l, storer_t s):
data(data), remaining(remaining), input_offset_calculator(ic), output_offset_calculator(oc), loader(l), storer(s) {}
__device__ inline bool check_inbounds(int thread_work_elem) {
return ((int)(threadIdx.x + thread_work_elem * num_threads) < remaining);
return ((int)(threadIdx.x + thread_work_elem*num_threads()) < remaining);
}
template<typename args_t>
@ -246,13 +205,13 @@ struct unroll_base {
int thread_idx = threadIdx.x;
#pragma unroll
for (int i = 0; i < elems_per_thread; i++) {
if (thread_idx < remaining) {
int linear_idx = thread_idx + block_work_size * idx;
auto offset = input_offset_calculator.get(linear_idx);
detail::static_unroll<detail::unroll_load_helper, arity>::with_args(
*this, args, offset, loader, i, num_outputs);
thread_idx += num_threads;
if (thread_idx >= remaining) {
return;
}
int linear_idx = thread_idx + elems_per_thread * num_threads() * idx;
auto offset = input_offset_calculator.get(linear_idx);
detail::static_unroll<detail::unroll_load_helper, arity>::with_args(*this, args, offset, loader, i, num_outputs);
thread_idx += num_threads();
}
}
@ -261,36 +220,22 @@ struct unroll_base {
int thread_idx = threadIdx.x;
#pragma unroll
for (int i = 0; i < elems_per_thread; i++) {
if (thread_idx < remaining) {
int linear_idx = thread_idx + block_work_size * idx;
int offset = output_offset_calculator.get(linear_idx)[0];
storer.store(from[i], data[0], offset);
thread_idx += num_threads;
if (thread_idx >= remaining) {
return;
}
int linear_idx = thread_idx + elems_per_thread * num_threads() * idx;
int offset = output_offset_calculator.get(linear_idx)[0];
storer.store(from[i], data[0], offset);
thread_idx += num_threads();
}
}
};
// Utility type for all users of unroll that extract the num_threads value from
// the caller scope.
template <
typename data_t,
typename inp_calc_t,
typename out_calc_t,
typename loader_t,
typename storer_t,
int elems_per_thread,
int num_outputs = 1>
using unroll = unroll_base<
num_threads(),
data_t,
inp_calc_t,
out_calc_t,
loader_t,
storer_t,
elems_per_thread,
num_outputs>;
// Assumption:
// all tensors are contiguous, that is: stride == sizeof(type) for all tensors
// Note:
// Functions in vectorized policy does not do boundary check. It assumes the whole block
// has its job to do. So the reminders should be handled by the caller manually.
template <int vec_size, typename data_t, int elems_per_thread> // vec_size: number of scalars, can be 1, 2, or 4.
struct vectorized {
@ -344,86 +289,6 @@ struct vectorized {
}
};
#ifdef USE_ROCM
// This is similar to vectorized policy above, but this one supports
// heterogenous input tensor types as templated parameters.
// Its use should be limited to frequently used heterogeneous data types
// as each instantiation will generate a separate kernel, leading to code
// bloating if applied to all combinations supported in PyTorch. Assumption: all
// tensors are contiguous, that is: stride == sizeof(type) for all tensors.
template <
int vec_size,
typename data_t,
int elems_per_thread,
int num_threads,
typename CastToT,
typename... CastFromTs> // vec_size: number of scalars, can be 1, 2, or 4.
struct vectorized_templated {
static_assert(
elems_per_thread % vec_size == 0,
"The workload per thread must be a multiple of vec_size");
static constexpr int loop_size = elems_per_thread / vec_size;
static constexpr int tws = elems_per_thread;
static constexpr int block_work_size = elems_per_thread * num_threads;
data_t data;
__device__ vectorized_templated(data_t data) : data(data) {}
__device__ inline constexpr bool check_inbounds(int thread_work_elem) {
return true;
}
template <int arg_index, typename accessor_t>
__device__ inline void load_single_arg(accessor_t to, char* ptr, int idx) {
// extract the arg_index-th input tensor element type from the
// variadic template argument.
using CastFromT =
std::tuple_element_t<arg_index, std::tuple<CastFromTs...>>;
// Delayed pointer arithmetic from the caller: this is the place
// where we know the type of the argument.
CastFromT* block_ptr =
reinterpret_cast<CastFromT*>(ptr) + block_work_size * idx;
int thread_idx = threadIdx.x;
#pragma unroll
for (int i = 0; i < loop_size; i++) {
int index = thread_idx + i * num_threads;
auto v = load_vector<vec_size>(block_ptr, index);
#pragma unroll
for (int j = 0; j < vec_size; j++) {
to(vec_size * i + j) = c10::convert<CastToT>(v.val[j]);
}
}
}
template <typename args_t>
__device__ inline void load(args_t* args, int idx) {
constexpr int arity = std::tuple_size<args_t>::value;
detail::static_unroll<detail::vectorized_templated_load_helper, arity>::
with_args(*this, args, idx);
}
// Assume for now that from (temporary array per thread) is of the same
// type as to (destination tensor), which is the case for
// float(float,bfloat16) and functor add on float(float,float).
template <typename scalar_t>
__device__ inline void store(scalar_t* from, int idx) {
using vec_t = aligned_vector<scalar_t, vec_size>;
scalar_t* to = reinterpret_cast<scalar_t*>(data[0]) + block_work_size * idx;
vec_t* to_ = reinterpret_cast<vec_t*>(to);
int thread_idx = threadIdx.x;
#pragma unroll
for (int i = 0; i < loop_size; i++) {
int index = thread_idx + i * num_threads;
vec_t v;
for (int j = 0; j < vec_size; j++) {
v.val[j] = from[vec_size * i + j];
}
to_[index] = v;
}
}
};
#endif
template <typename data_t, typename inp_calc_t, typename out_calc_t, int num_outputs>
struct multi_outputs_unroll {
//multi_outputs_unroll struct members and check_inbounds and load methods are copypasted from unroll struct

View File

@ -89,20 +89,6 @@ struct SoftMaxBackwardEpilogue {
const AccumT sum;
};
template<typename T, typename AccumT, typename OutT>
struct SoftMaxForwardWithMulEpilogue {
__device__ __forceinline__ SoftMaxForwardWithMulEpilogue(AccumT max_input, AccumT sum)
: max_input(max_input)
, sum(sum) {}
__device__ __forceinline__ OutT operator()(T input) const {
return static_cast<OutT>(__expf(input - max_input) * sum);
}
const AccumT max_input;
const AccumT sum;
};
@ -401,19 +387,6 @@ struct SumExpFloat
const AccumT max_k;
};
template<typename T, typename AccumT>
struct SumExpfFloat
{
__device__ __forceinline__ SumExpfFloat(AccumT v)
: max_k(v) {}
__device__ __forceinline__ AccumT operator()(AccumT sum, T v) const {
return sum + __expf(v - max_k);
}
const AccumT max_k;
};
template <template<typename> class Reduction, typename AccumT>
__device__ __forceinline__ AccumT
blockReduce(AccumT* smem, AccumT val,
@ -476,19 +449,6 @@ T blockReduceWarp(T* smem_cache, T value, const Reduction<T>& op, T defaultVal)
return smem_cache[0];
}
template <template<typename> class Reduction, typename T>
__device__ __forceinline__
T blockReduceWarpInverse(T* smem_cache, T value, const Reduction<T>& op, T defaultVal)
{
T result = cuda_utils::BlockReduce<T, Reduction<T>>(value, op, defaultVal, smem_cache);
if (threadIdx.x == 0) {
smem_cache[0] = 1 / result;
}
__syncthreads();
return smem_cache[0];
}
template <template<typename, typename> class Reduction, int ILP, typename T, typename AccumT, typename index_t=int>
__device__ __forceinline__ AccumT
ilpReduce(index_t shift,
@ -704,38 +664,6 @@ WriteBpropResults(
}
}
template <int ILP, typename scalar_t, typename accscalar_t, typename outscalar_t, template <typename, typename, typename> class EpilogueWithMul>
__global__ void
cunn_SoftMaxForwardFast(outscalar_t *output, const scalar_t *input, int classes)
{
extern __shared__ unsigned char smem[];
auto sdata = reinterpret_cast<accscalar_t*>(smem);
// each block handles a sample in the mini-batch
input += static_cast<int64_t>(blockIdx.x) * classes;
output += static_cast<int64_t>(blockIdx.x) * classes;
const int shift = ((uint64_t)input) % ALIGN_BYTES / sizeof(scalar_t);
// find the max
accscalar_t threadMax = ilpReduce<MaxFloat, ILP, scalar_t, accscalar_t>(
shift, input, classes, MaxFloat<scalar_t, accscalar_t>(), -at::numeric_limits<accscalar_t>::max());
accscalar_t max_k = blockReduceWarp<Max, accscalar_t>(sdata, threadMax,
Max<accscalar_t>(), -at::numeric_limits<accscalar_t>::max());
// reduce all values
accscalar_t threadExp = ilpReduce<SumExpfFloat, ILP, scalar_t, accscalar_t>(
shift, input, classes, SumExpfFloat<scalar_t, accscalar_t>(max_k), static_cast<accscalar_t>(0));
accscalar_t sumAll = blockReduceWarpInverse<Add, accscalar_t>(sdata, threadExp,
Add<accscalar_t>(), static_cast<accscalar_t>(0));
EpilogueWithMul<scalar_t, accscalar_t, outscalar_t> epilogue(max_k, sumAll);
for (int offset = threadIdx.x; offset < classes; offset += blockDim.x) {
output[offset] = epilogue(input[offset]);
}
}
template <int ILP, typename scalar_t, typename accscalar_t, typename outscalar_t, template <typename, typename, typename> class Epilogue>
__global__ void
cunn_SoftMaxForward(outscalar_t *output, const scalar_t *input, int classes)
@ -827,68 +755,6 @@ cunn_SoftMaxForwardReg(outscalar_t *output, const scalar_t *input, index_t class
}
}
template <int ILP, typename scalar_t, typename accscalar_t, typename outscalar_t,
template <typename, typename, typename> class EpilogueWithMul, typename index_t = int32_t>
__global__ void
cunn_SoftMaxForwardGmem(outscalar_t *output, const scalar_t *input, index_t classes)
{
// Each thread block processes a sample in the batch
input += static_cast<int64_t>(blockIdx.x) * classes;
output += static_cast<int64_t>(blockIdx.x) * classes;
accscalar_t threadMax = -at::numeric_limits<accscalar_t>::max();
accscalar_t threadExp = static_cast<accscalar_t>(0);
// The first smem segment is used to cache input values and the last
// segment is used for thread block reductions
extern __shared__ unsigned char smem[];
auto smem_reduction_cache = reinterpret_cast<accscalar_t*>(smem);
using LoadT = at::native::memory::aligned_vector<scalar_t, ILP>;
const LoadT* const input_vec_ptr = reinterpret_cast<const LoadT*>(input);
// Do the first step in max calculation:
MaxFloat<scalar_t, accscalar_t> maxFunc;
for (index_t offset = threadIdx.x; offset * ILP < classes; offset += blockDim.x) {
LoadT crnt_vec = input_vec_ptr[offset];
#pragma unroll
for (int i = 0; i < ILP; ++i) {
threadMax = maxFunc(threadMax, crnt_vec.val[i]);
}
}
accscalar_t max_k = blockReduceWarp<Max, accscalar_t>(smem_reduction_cache, threadMax,
Max<accscalar_t>(), -at::numeric_limits<accscalar_t>::max());
// Do the second step in sum exp calculation:
SumExpfFloat<scalar_t, accscalar_t> sumExpFunc(max_k);
for (index_t offset = threadIdx.x; offset * ILP < classes; offset += blockDim.x) {
LoadT crnt_vec = input_vec_ptr[offset];
#pragma unroll
for (int i = 0; i < ILP; ++i) {
threadExp = sumExpFunc(threadExp, crnt_vec.val[i]);
}
}
accscalar_t sumAll = blockReduceWarpInverse<Add, accscalar_t>(smem_reduction_cache, threadExp,
Add<accscalar_t>(), static_cast<accscalar_t>(0));
EpilogueWithMul<scalar_t, accscalar_t, outscalar_t> epilogue(max_k, sumAll);
using StoreT = at::native::memory::aligned_vector<outscalar_t, ILP>;
StoreT* output_vec_ptr = reinterpret_cast<StoreT*>(output);
for (index_t offset = threadIdx.x; offset * ILP < classes; offset += blockDim.x) {
LoadT crnt_vec = input_vec_ptr[offset];
StoreT out_vec;
#pragma unroll
for (int i = 0; i < ILP; ++i) {
out_vec.val[i] = epilogue(crnt_vec.val[i]);
}
output_vec_ptr[offset] = out_vec;
}
}
template <int ILP, typename scalar_t, typename accscalar_t, typename outscalar_t,
template <typename, typename, typename> class Epilogue, typename index_t = int32_t>
__global__ void
@ -1069,9 +935,7 @@ cunn_SoftMaxBackwardSmem(scalar_t *gradInput, const outscalar_t *output, const o
}
}
template<template<typename, typename, typename> class Epilogue,
template<typename, typename, typename> class EpilogueWithMul, bool is_log_softmax, bool use_fast_softmax>
template<template<typename, typename, typename> class Epilogue, bool is_log_softmax>
Tensor host_softmax(const Tensor & input_, const int64_t dim_, const bool half_to_float, const Tensor& output){
if (half_to_float) {
TORCH_CHECK(input_.scalar_type() == ScalarType::Half, "conversion is supported for Half type only");
@ -1113,78 +977,66 @@ Tensor host_softmax(const Tensor & input_, const int64_t dim_, const bool half_t
}
} else {
constexpr int ILP = sizeof(float4) / sizeof(scalar_t);
if constexpr (use_fast_softmax) {
dim3 block(512);
size_t smem_reduction_sz = block.x / C10_WARP_SIZE * sizeof(accscalar_t);
if (dim_size % ILP == 0) {
cunn_SoftMaxForwardGmem<ILP, scalar_t, accscalar_t, scalar_t, EpilogueWithMul>
dim3 block = SoftMaxForward_getBlockSize(dim_size);
size_t smem_reduction_sz = block.x / C10_WARP_SIZE * sizeof(accscalar_t);
auto max_elements_per_smem = (at::cuda::getCurrentDeviceProperties()->sharedMemPerBlock -
smem_reduction_sz) / sizeof(scalar_t);
bool can_use_smem = static_cast<size_t>(dim_size) < max_elements_per_smem;
can_use_smem &= !(reinterpret_cast<uintptr_t>(input_ptr) % ALIGN_BYTES);
can_use_smem &= (!(reinterpret_cast<uintptr_t>(output_ptr) % ALIGN_BYTES));
can_use_smem &= !(dim_size % ILP);
int32_t potential_reg_cnt = potential_register_count(dim_size, block.x);
if(potential_reg_cnt < 10){
TORCH_INTERNAL_ASSERT(potential_reg_cnt > 0, "potential_reg_cnt for softmax with register should be greater than 0.");
switch (potential_reg_cnt) {
// TODO(Wenqin): investigate why a macro cannot be used for the code below;
// on MSVS the macro does not seem to expand correctly.
case 1:
cunn_SoftMaxForwardReg<scalar_t, accscalar_t, scalar_t, Epilogue, int64_t, 1>
<<<grid, block, smem_reduction_sz, stream>>>(output_ptr, input_ptr, dim_size);
} else {
cunn_SoftMaxForwardFast<ILP, scalar_t, accscalar_t, scalar_t, EpilogueWithMul>
break;
case 2:
cunn_SoftMaxForwardReg<scalar_t, accscalar_t, scalar_t, Epilogue, int64_t, 2>
<<<grid, block, smem_reduction_sz, stream>>>(output_ptr, input_ptr, dim_size);
break;
case 3:
cunn_SoftMaxForwardReg<scalar_t, accscalar_t, scalar_t, Epilogue, int64_t, 3>
<<<grid, block, smem_reduction_sz, stream>>>(output_ptr, input_ptr, dim_size);
break;
case 4:
cunn_SoftMaxForwardReg<scalar_t, accscalar_t, scalar_t, Epilogue, int64_t, 4>
<<<grid, block, smem_reduction_sz, stream>>>(output_ptr, input_ptr, dim_size);
break;
case 5:
cunn_SoftMaxForwardReg<scalar_t, accscalar_t, scalar_t, Epilogue, int64_t, 5>
<<<grid, block, smem_reduction_sz, stream>>>(output_ptr, input_ptr, dim_size);
break;
case 6:
cunn_SoftMaxForwardReg<scalar_t, accscalar_t, scalar_t, Epilogue, int64_t, 6>
<<<grid, block, smem_reduction_sz, stream>>>(output_ptr, input_ptr, dim_size);
break;
case 7:
cunn_SoftMaxForwardReg<scalar_t, accscalar_t, scalar_t, Epilogue, int64_t, 7>
<<<grid, block, smem_reduction_sz, stream>>>(output_ptr, input_ptr, dim_size);
break;
case 8:
cunn_SoftMaxForwardReg<scalar_t, accscalar_t, scalar_t, Epilogue, int64_t, 8>
<<<grid, block, smem_reduction_sz, stream>>>(output_ptr, input_ptr, dim_size);
break;
case 9:
cunn_SoftMaxForwardReg<scalar_t, accscalar_t, scalar_t, Epilogue, int64_t, 9>
<<<grid, block, smem_reduction_sz, stream>>>(output_ptr, input_ptr, dim_size);
break;
}
} else if (can_use_smem) {
size_t smem_sz = dim_size * sizeof(scalar_t) + smem_reduction_sz;
cunn_SoftMaxForwardSmem<ILP, scalar_t, accscalar_t, scalar_t, Epilogue>
<<<grid, block, smem_sz, stream>>>(output_ptr, input_ptr, dim_size);
} else {
dim3 block = SoftMaxForward_getBlockSize(dim_size);
size_t smem_reduction_sz = block.x / C10_WARP_SIZE * sizeof(accscalar_t);
auto max_elements_per_smem = (at::cuda::getCurrentDeviceProperties()->sharedMemPerBlock -
smem_reduction_sz) / sizeof(scalar_t);
bool can_use_smem = static_cast<size_t>(dim_size) < max_elements_per_smem;
can_use_smem &= !(reinterpret_cast<uintptr_t>(input_ptr) % ALIGN_BYTES);
can_use_smem &= (!(reinterpret_cast<uintptr_t>(output_ptr) % ALIGN_BYTES));
can_use_smem &= !(dim_size % ILP);
int32_t potential_reg_cnt = potential_register_count(dim_size, block.x);
if(potential_reg_cnt < 10){
TORCH_INTERNAL_ASSERT(potential_reg_cnt > 0, "potential_reg_cnt for softmax with register should be greater than 0.");
switch (potential_reg_cnt) {
// TODO(Wenqin): investigate why a macro cannot be used for the code below;
// on MSVS the macro does not seem to expand correctly.
case 1:
cunn_SoftMaxForwardReg<scalar_t, accscalar_t, scalar_t, Epilogue, int64_t, 1>
<<<grid, block, smem_reduction_sz, stream>>>(output_ptr, input_ptr, dim_size);
break;
case 2:
cunn_SoftMaxForwardReg<scalar_t, accscalar_t, scalar_t, Epilogue, int64_t, 2>
<<<grid, block, smem_reduction_sz, stream>>>(output_ptr, input_ptr, dim_size);
break;
case 3:
cunn_SoftMaxForwardReg<scalar_t, accscalar_t, scalar_t, Epilogue, int64_t, 3>
<<<grid, block, smem_reduction_sz, stream>>>(output_ptr, input_ptr, dim_size);
break;
case 4:
cunn_SoftMaxForwardReg<scalar_t, accscalar_t, scalar_t, Epilogue, int64_t, 4>
<<<grid, block, smem_reduction_sz, stream>>>(output_ptr, input_ptr, dim_size);
break;
case 5:
cunn_SoftMaxForwardReg<scalar_t, accscalar_t, scalar_t, Epilogue, int64_t, 5>
<<<grid, block, smem_reduction_sz, stream>>>(output_ptr, input_ptr, dim_size);
break;
case 6:
cunn_SoftMaxForwardReg<scalar_t, accscalar_t, scalar_t, Epilogue, int64_t, 6>
<<<grid, block, smem_reduction_sz, stream>>>(output_ptr, input_ptr, dim_size);
break;
case 7:
cunn_SoftMaxForwardReg<scalar_t, accscalar_t, scalar_t, Epilogue, int64_t, 7>
<<<grid, block, smem_reduction_sz, stream>>>(output_ptr, input_ptr, dim_size);
break;
case 8:
cunn_SoftMaxForwardReg<scalar_t, accscalar_t, scalar_t, Epilogue, int64_t, 8>
<<<grid, block, smem_reduction_sz, stream>>>(output_ptr, input_ptr, dim_size);
break;
case 9:
cunn_SoftMaxForwardReg<scalar_t, accscalar_t, scalar_t, Epilogue, int64_t, 9>
<<<grid, block, smem_reduction_sz, stream>>>(output_ptr, input_ptr, dim_size);
break;
}
} else if (can_use_smem) {
size_t smem_sz = dim_size * sizeof(scalar_t) + smem_reduction_sz;
cunn_SoftMaxForwardSmem<ILP, scalar_t, accscalar_t, scalar_t, Epilogue>
<<<grid, block, smem_sz, stream>>>(output_ptr, input_ptr, dim_size);
} else {
cunn_SoftMaxForward<ILP, scalar_t, accscalar_t, scalar_t, Epilogue>
<<<grid, block, smem_reduction_sz, stream>>>(output_ptr, input_ptr, dim_size);
}
cunn_SoftMaxForward<ILP, scalar_t, accscalar_t, scalar_t, Epilogue>
<<<grid, block, smem_reduction_sz, stream>>>(output_ptr, input_ptr, dim_size);
}
C10_CUDA_KERNEL_LAUNCH_CHECK();
@ -1204,35 +1056,23 @@ Tensor host_softmax(const Tensor & input_, const int64_t dim_, const bool half_t
}
} else {
constexpr int ILP = sizeof(float4) / sizeof(scalar_t);
if constexpr (use_fast_softmax) {
dim3 block(512);
size_t smem_reduction_sz = block.x / C10_WARP_SIZE * sizeof(accscalar_t);
if (dim_size % ILP == 0) {
cunn_SoftMaxForwardGmem<ILP, scalar_t, accscalar_t, accscalar_t, EpilogueWithMul>
<<<grid, block, smem_reduction_sz, stream>>>(output_ptr, input_ptr, dim_size);
} else {
cunn_SoftMaxForwardFast<ILP, scalar_t, accscalar_t, accscalar_t, EpilogueWithMul>
<<<grid, block, smem_reduction_sz, stream>>>(output_ptr, input_ptr, dim_size);
}
dim3 block = SoftMaxForward_getBlockSize(dim_size);
size_t smem_reduction_sz = block.x / C10_WARP_SIZE * sizeof(accscalar_t);
auto max_elements_per_smem = (at::cuda::getCurrentDeviceProperties()->sharedMemPerBlock -
smem_reduction_sz) / sizeof(scalar_t);
bool can_use_smem = static_cast<size_t>(dim_size) < max_elements_per_smem;
can_use_smem &= !(reinterpret_cast<uintptr_t>(input_ptr) % ALIGN_BYTES);
can_use_smem &= (!(reinterpret_cast<uintptr_t>(output_ptr) % ALIGN_BYTES));
can_use_smem &= !(dim_size % ILP);
if (can_use_smem) {
size_t smem_sz = dim_size * sizeof(scalar_t) + smem_reduction_sz;
cunn_SoftMaxForwardSmem<ILP, scalar_t, accscalar_t, accscalar_t, Epilogue>
<<<grid, block, smem_sz, stream>>>(output_ptr, input_ptr, dim_size);
} else {
dim3 block = SoftMaxForward_getBlockSize(dim_size);
size_t smem_reduction_sz = block.x / C10_WARP_SIZE * sizeof(accscalar_t);
auto max_elements_per_smem = (at::cuda::getCurrentDeviceProperties()->sharedMemPerBlock -
smem_reduction_sz) / sizeof(scalar_t);
bool can_use_smem = static_cast<size_t>(dim_size) < max_elements_per_smem;
can_use_smem &= !(reinterpret_cast<uintptr_t>(input_ptr) % ALIGN_BYTES);
can_use_smem &= (!(reinterpret_cast<uintptr_t>(output_ptr) % ALIGN_BYTES));
can_use_smem &= !(dim_size % ILP);
if (can_use_smem) {
size_t smem_sz = dim_size * sizeof(scalar_t) + smem_reduction_sz;
cunn_SoftMaxForwardSmem<ILP, scalar_t, accscalar_t, accscalar_t, Epilogue>
<<<grid, block, smem_sz, stream>>>(output_ptr, input_ptr, dim_size);
} else {
cunn_SoftMaxForward<ILP, scalar_t, accscalar_t, accscalar_t, Epilogue>
<<<grid, block, smem_reduction_sz, stream>>>(output_ptr, input_ptr, dim_size);
}
cunn_SoftMaxForward<ILP, scalar_t, accscalar_t, accscalar_t, Epilogue>
<<<grid, block, smem_reduction_sz, stream>>>(output_ptr, input_ptr, dim_size);
}
C10_CUDA_KERNEL_LAUNCH_CHECK();
@ -1412,7 +1252,7 @@ TORCH_IMPL_FUNC(log_softmax_cuda_out) (
const int64_t dim,
const bool half_to_float,
const Tensor &output) {
host_softmax<LogSoftMaxForwardEpilogue, LogSoftMaxForwardEpilogue, true, false>(input, dim, half_to_float, output);
host_softmax<LogSoftMaxForwardEpilogue,true>(input, dim, half_to_float, output);
}
TORCH_IMPL_FUNC(log_softmax_backward_cuda_out) (
@ -1436,11 +1276,7 @@ TORCH_IMPL_FUNC(softmax_cuda_out) (
const int64_t dim,
const bool half_to_float,
const Tensor &output) {
#if defined(USE_ROCM)
host_softmax<SoftMaxForwardEpilogue, SoftMaxForwardWithMulEpilogue, false, true>(input, dim, half_to_float, output);
#else
host_softmax<SoftMaxForwardEpilogue, SoftMaxForwardWithMulEpilogue, false, false>(input, dim, half_to_float, output);
#endif
host_softmax<SoftMaxForwardEpilogue,false>(input, dim, half_to_float, output);
}
TORCH_IMPL_FUNC(softmax_backward_cuda_out)

View File

@ -469,315 +469,11 @@ void dispatch_bfloat16_gemm(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) {
}
}
// Picks and launches the gemm_impl_wmma instantiation for BFloat16 operands.
//
// All eight instantiations share the leading tile configuration
// (256, 128, 256, 64, 8, 16, 16, 4, 4) and the trailing C-shuffle settings;
// they differ only in the A/B block-transfer arguments and in the last three
// boolean template arguments, which are gemm_impl_wmma's PADDING, TRANSA and
// TRANSB parameters.
void dispatch_bfloat16_gemm_wmma(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) {
  // Anything other than 'n'/'N' counts as "transposed".
  const bool transa_ = std::tolower(transa) != 'n';
  const bool transb_ = std::tolower(transb) != 'n';

  // Padded kernels are required unless m, n and k are multiples of
  // 256, 128 and 64 respectively.
  const bool needs_padding =
      !((m % 256 == 0) && (n % 128 == 0) && (k % 64 == 0));

  // Bit 1 carries transa_, bit 0 carries transb_.
  const int layout = (transa_ ? 2 : 0) + (transb_ ? 1 : 0);

  // Dispatch to best implementation.
  // TODO add more configurations. Optimize.
  if (needs_padding) {
    switch (layout) {
      case 3: // col, col
        gemm_impl_wmma<
            at::BFloat16, 256, 128, 256, 64, 8, 16, 16, 4, 4,
            S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true,
            S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true,
            1, 1, S<1, 32, 1, 8>, 8,
            true, true, true>(CUDABLAS_GEMM_ARGS(at::BFloat16));
        break;
      case 2: // row, col
        gemm_impl_wmma<
            at::BFloat16, 256, 128, 256, 64, 8, 16, 16, 4, 4,
            S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true,
            S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true,
            1, 1, S<1, 32, 1, 8>, 8,
            true, true, false>(CUDABLAS_GEMM_ARGS(at::BFloat16));
        break;
      case 1: // col, row
        gemm_impl_wmma<
            at::BFloat16, 256, 128, 256, 64, 8, 16, 16, 4, 4,
            S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true,
            S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true,
            1, 1, S<1, 32, 1, 8>, 8,
            true, false, true>(CUDABLAS_GEMM_ARGS(at::BFloat16));
        break;
      case 0: // row, row
        gemm_impl_wmma<
            at::BFloat16, 256, 128, 256, 64, 8, 16, 16, 4, 4,
            S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true,
            S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true,
            1, 1, S<1, 32, 1, 8>, 8,
            true, false, false>(CUDABLAS_GEMM_ARGS(at::BFloat16));
        break;
      default:
        TORCH_CHECK(false, "unreachable");
    }
  } else {
    switch (layout) {
      case 3: // col, col
        gemm_impl_wmma<
            at::BFloat16, 256, 128, 256, 64, 8, 16, 16, 4, 4,
            S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true,
            S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true,
            1, 1, S<1, 32, 1, 8>, 8,
            false, true, true>(CUDABLAS_GEMM_ARGS(at::BFloat16));
        break;
      case 2: // row, col
        gemm_impl_wmma<
            at::BFloat16, 256, 128, 256, 64, 8, 16, 16, 4, 4,
            S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true,
            S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true,
            1, 1, S<1, 32, 1, 8>, 8,
            false, true, false>(CUDABLAS_GEMM_ARGS(at::BFloat16));
        break;
      case 1: // col, row
        gemm_impl_wmma<
            at::BFloat16, 256, 128, 256, 64, 8, 16, 16, 4, 4,
            S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true,
            S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true,
            1, 1, S<1, 32, 1, 8>, 8,
            false, false, true>(CUDABLAS_GEMM_ARGS(at::BFloat16));
        break;
      case 0: // row, row
        gemm_impl_wmma<
            at::BFloat16, 256, 128, 256, 64, 8, 16, 16, 4, 4,
            S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true,
            S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true,
            1, 1, S<1, 32, 1, 8>, 8,
            false, false, false>(CUDABLAS_GEMM_ARGS(at::BFloat16));
        break;
      default:
        TORCH_CHECK(false, "unreachable");
    }
  }
}
template <>
void gemm_internal_ck<at::BFloat16>(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) {
auto dprops = at::cuda::getCurrentDeviceProperties();
c10::string_view arch(dprops->gcnArchName);
if (arch == "gfx1100") {
dispatch_bfloat16_gemm_wmma(CUDABLAS_GEMM_ARGS(at::BFloat16));
} else{
dispatch_bfloat16_gemm(CUDABLAS_GEMM_ARGS(at::BFloat16));
}
dispatch_bfloat16_gemm(CUDABLAS_GEMM_ARGS(at::BFloat16));
}
} // namespace at::native

View File

@ -297,314 +297,10 @@ void dispatch_half_gemm(CUDABLAS_GEMM_ARGTYPES(at::Half)) {
}
#endif
}
// Picks and launches the gemm_impl_wmma instantiation for Half operands.
//
// All eight instantiations share the leading tile configuration
// (256, 128, 256, 64, 8, 16, 16, 4, 4) and the trailing C-shuffle settings;
// they differ only in the A/B block-transfer arguments and in the last three
// boolean template arguments, which are gemm_impl_wmma's PADDING, TRANSA and
// TRANSB parameters.
void dispatch_half_gemm_wmma(CUDABLAS_GEMM_ARGTYPES(at::Half)) {
  // Anything other than 'n'/'N' counts as "transposed".
  const bool transa_ = std::tolower(transa) != 'n';
  const bool transb_ = std::tolower(transb) != 'n';

  // Padded kernels are required unless m, n and k are multiples of
  // 256, 128 and 64 respectively.
  const bool needs_padding =
      !((m % 256 == 0) && (n % 128 == 0) && (k % 64 == 0));

  // Bit 1 carries transa_, bit 0 carries transb_.
  const int layout = (transa_ ? 2 : 0) + (transb_ ? 1 : 0);

  // Dispatch to best implementation.
  // TODO add more configurations. Optimize.
  if (needs_padding) {
    switch (layout) {
      case 3: // col, col
        gemm_impl_wmma<
            at::Half, 256, 128, 256, 64, 8, 16, 16, 4, 4,
            S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true,
            S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true,
            1, 1, S<1, 32, 1, 8>, 8,
            true, true, true>(CUDABLAS_GEMM_ARGS(at::Half));
        break;
      case 2: // row, col
        gemm_impl_wmma<
            at::Half, 256, 128, 256, 64, 8, 16, 16, 4, 4,
            S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true,
            S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true,
            1, 1, S<1, 32, 1, 8>, 8,
            true, true, false>(CUDABLAS_GEMM_ARGS(at::Half));
        break;
      case 1: // col, row
        gemm_impl_wmma<
            at::Half, 256, 128, 256, 64, 8, 16, 16, 4, 4,
            S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true,
            S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true,
            1, 1, S<1, 32, 1, 8>, 8,
            true, false, true>(CUDABLAS_GEMM_ARGS(at::Half));
        break;
      case 0: // row, row
        gemm_impl_wmma<
            at::Half, 256, 128, 256, 64, 8, 16, 16, 4, 4,
            S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true,
            S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true,
            1, 1, S<1, 32, 1, 8>, 8,
            true, false, false>(CUDABLAS_GEMM_ARGS(at::Half));
        break;
      default:
        TORCH_CHECK(false, "unreachable");
    }
  } else {
    switch (layout) {
      case 3: // col, col
        gemm_impl_wmma<
            at::Half, 256, 128, 256, 64, 8, 16, 16, 4, 4,
            S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true,
            S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true,
            1, 1, S<1, 32, 1, 8>, 8,
            false, true, true>(CUDABLAS_GEMM_ARGS(at::Half));
        break;
      case 2: // row, col
        gemm_impl_wmma<
            at::Half, 256, 128, 256, 64, 8, 16, 16, 4, 4,
            S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true,
            S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true,
            1, 1, S<1, 32, 1, 8>, 8,
            false, true, false>(CUDABLAS_GEMM_ARGS(at::Half));
        break;
      case 1: // col, row
        gemm_impl_wmma<
            at::Half, 256, 128, 256, 64, 8, 16, 16, 4, 4,
            S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true,
            S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true,
            1, 1, S<1, 32, 1, 8>, 8,
            false, false, true>(CUDABLAS_GEMM_ARGS(at::Half));
        break;
      case 0: // row, row
        gemm_impl_wmma<
            at::Half, 256, 128, 256, 64, 8, 16, 16, 4, 4,
            S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true,
            S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, true,
            1, 1, S<1, 32, 1, 8>, 8,
            false, false, false>(CUDABLAS_GEMM_ARGS(at::Half));
        break;
      default:
        TORCH_CHECK(false, "unreachable");
    }
  }
}
template <>
void gemm_internal_ck<at::Half>(CUDABLAS_GEMM_ARGTYPES(at::Half)) {
auto dprops = at::cuda::getCurrentDeviceProperties();
c10::string_view arch(dprops->gcnArchName);
if (arch == "gfx1100") {
dispatch_half_gemm_wmma(CUDABLAS_GEMM_ARGS(at::Half));
} else{
dispatch_half_gemm(CUDABLAS_GEMM_ARGS(at::Half));
}
dispatch_half_gemm(CUDABLAS_GEMM_ARGS(at::Half));
}
} // namespace at::native

View File

@ -30,7 +30,6 @@
#include <ck/library/utility/literals.hpp>
#include <ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle_v3.hpp>
#include <ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp>
// Define commonly used types.
template <ck::index_t... Is>
@ -237,180 +236,4 @@ void gemm_impl(CUDABLAS_GEMM_ARGTYPES(Dtype)) {
invoker.Run(argument, StreamConfig{stream, false});
}
// Builds one DeviceGemmWmma_CShuffle instance from the template arguments and
// runs it on the current HIP stream.
//
// Template parameters:
//   Dtype            element type; mapped to the CK type via CkMathType.
//   BLOCK_SIZE .. CNPER_BLOCK
//                    forwarded unchanged, in order, to DeviceGemmWmma_CShuffle.
//   PADDING          selects GemmSpecialization::MNKPadding instead of Default.
//   TRANSA, TRANSB   select the A/B layouts through CkTensorLayout.
//
// Throws std::runtime_error (after printing the offending shape) when the
// instance reports the argument as unsupported.
//
// NOTE(review): alpha and beta are accepted but never handed to the kernel
// (all three element-wise ops are PassThrough). Presumably callers only reach
// this path with alpha == 1 and beta == 0 -- confirm.
template <
    typename Dtype,
    int BLOCK_SIZE,
    int MBLOCK,
    int NBLOCK,
    int KBLOCK,
    int K1,
    int MPER_WMMA,
    int NPER_WMMA,
    int MPER_WAVE,
    int NPER_WAVE,
    typename ABLOCK_CLUSTER_LENS,
    typename ABLOCK_CLUSTER_ORDER,
    typename ABLOCK_SRC_ORDER,
    int ABLOCK_VECTOR_DIM,
    int ABLOCK_SCALAR_VEC,
    int ABLOCK_SCALAR_VEC_K1,
    bool ABLOCK_LDS_EXTRAM,
    typename BBLOCK_CLUSTER_LENS,
    typename BBLOCK_CLUSTER_ORDER,
    typename BBLOCK_SRC_ORDER,
    int BBLOCK_VECTOR_DIM,
    int BBLOCK_SCALAR_VEC,
    int BBLOCK_SCALAR_VEC_AK1,
    bool BBLOCK_LDS_EXTRAN,
    int CMPER_WAVE,
    int CNPER_WAVE,
    typename CBLOCK_CLUSTER_LENS,
    int CNPER_BLOCK,
    bool PADDING = false,
    bool TRANSA = false,
    bool TRANSB = false>
void gemm_impl_wmma(CUDABLAS_GEMM_ARGTYPES(Dtype)) {
  // The scaling factors are not applied by this implementation; see the
  // review note in the header comment.
  (void)alpha;
  (void)beta;

  // Get input information (narrowed to int, as before).
  int M = m;
  int N = n;
  int K = k;

  int StrideA = lda;
  int StrideB = ldb;
  int StrideC = ldc;

  using ADataType = typename CkMathType<Dtype>::dtype;
  using BDataType = typename CkMathType<Dtype>::dtype;
  using CDataType = typename CkMathType<Dtype>::dtype;

  using AccDataType = float;
  using CShuffleDataType = typename CkMathType<Dtype>::dtype;

  using ALayout = typename CkTensorLayout<TRANSA, TRANSB>::a_layout;
  using BLayout = typename CkTensorLayout<TRANSA, TRANSB>::b_layout;

  using CLayout = Row;

  using AElementOp = PassThrough;
  using BElementOp = PassThrough;
  using CElementOp = PassThrough;

  static constexpr auto GemmDefault =
      ck::tensor_operation::device::GemmSpecialization::Default;
  static constexpr auto GemmMNKPadding =
      ck::tensor_operation::device::GemmSpecialization::MNKPadding;
  static constexpr auto GemmSpec = PADDING ? GemmMNKPadding : GemmDefault;

  using DeviceGemmInstance =
      ck::tensor_operation::device::DeviceGemmWmma_CShuffle<ALayout,
                                                            BLayout,
                                                            CLayout,
                                                            ADataType,
                                                            BDataType,
                                                            CDataType,
                                                            AccDataType,
                                                            CShuffleDataType,
                                                            AElementOp,
                                                            BElementOp,
                                                            CElementOp,
                                                            GemmSpec,
                                                            1, // NumPrefetch
                                                            BLOCK_SIZE,
                                                            MBLOCK,
                                                            NBLOCK,
                                                            KBLOCK,
                                                            K1,
                                                            MPER_WMMA,
                                                            NPER_WMMA,
                                                            MPER_WAVE,
                                                            NPER_WAVE,
                                                            ABLOCK_CLUSTER_LENS,
                                                            ABLOCK_CLUSTER_ORDER,
                                                            ABLOCK_SRC_ORDER,
                                                            ABLOCK_VECTOR_DIM,
                                                            ABLOCK_SCALAR_VEC,
                                                            ABLOCK_SCALAR_VEC_K1,
                                                            ABLOCK_LDS_EXTRAM,
                                                            BBLOCK_CLUSTER_LENS,
                                                            BBLOCK_CLUSTER_ORDER,
                                                            BBLOCK_SRC_ORDER,
                                                            BBLOCK_VECTOR_DIM,
                                                            BBLOCK_SCALAR_VEC,
                                                            BBLOCK_SCALAR_VEC_AK1,
                                                            BBLOCK_LDS_EXTRAN,
                                                            CMPER_WAVE,
                                                            CNPER_WAVE,
                                                            CBLOCK_CLUSTER_LENS,
                                                            CNPER_BLOCK>;

  auto gemm = DeviceGemmInstance{};
  auto invoker = gemm.MakeInvoker();

  auto a_element_op = AElementOp{};
  auto b_element_op = BElementOp{};
  auto c_element_op = CElementOp{};

  // We swap A and B inputs here as a temporary workaround: b/N/StrideB go in
  // the "A" slots and a/M/StrideA in the "B" slots.
  auto argument = gemm.MakeArgument(
      reinterpret_cast<const ADataType*>(b),
      reinterpret_cast<const BDataType*>(a),
      reinterpret_cast<CDataType*>(c),
      N,
      M,
      K,
      StrideB,
      StrideA,
      StrideC,
      b_element_op,
      a_element_op,
      c_element_op);

  if (!gemm.IsSupportedArgument(argument)) {
    // Print the int copies (N, M, K) rather than the raw parameters so the
    // arguments match the %d conversions regardless of the parameters' width.
    printf("error shape = %d %d %d TRANSA=%d TRANSB=%d \n",
           N, M, K, static_cast<int>(TRANSA), static_cast<int>(TRANSB));

    throw std::runtime_error(
        "wrong! device_gemm with the specified compilation parameters does "
        "not support this GEMM problem");
  }

  auto stream = at::cuda::getCurrentHIPStream().stream();
  // Second StreamConfig field is false here; the removed benchmark branch
  // passed true to obtain a timing from Run().
  invoker.Run(argument, StreamConfig{stream, false});
}
} // namespace at::native

View File

@ -311,8 +311,9 @@ void gpu_float_sdpa(
bool is_causal,
float softmax_scale,
const Tensor& output) {
auto& eng = GpuEngineManager::Instance().get_engine();
auto& strm = GpuStreamManager::Instance().get_stream();
auto eng = GpuEngineManager::Instance().get_engine(
{c10::kXPU, c10::xpu::current_device()});
auto strm = GpuStreamManager::Instance().get_stream();
const auto get_tril_mask = [&]() {
auto opts = query.options();

View File

@ -338,7 +338,8 @@ class Attr {
// [1, C, 1, 1], channel broadcast
// [dst.shape], no broadcast and eltwise-wise binary operations on dst
auto& engine = GpuEngineManager::Instance().get_engine();
auto engine = GpuEngineManager::Instance().get_engine(
{c10::kXPU, c10::xpu::current_device()});
for (size_t i = 0; i < ops_params_.size(); ++i) {
kind_t kind = ops_params_[i].kind_;
if (kind == kind_t::binary) {

View File

@ -83,8 +83,9 @@ sycl::event convolution(
int64_t groups,
Attr& attr,
const std::vector<sycl::event>& deps) {
auto& engine = GpuEngineManager::Instance().get_engine();
auto& stream = GpuStreamManager::Instance().get_stream();
auto engine = GpuEngineManager::Instance().get_engine(
{c10::kXPU, c10::xpu::current_device()});
auto stream = GpuStreamManager::Instance().get_stream();
bool is_channels_last = use_channels_last_for_conv(src, weight);
@ -183,8 +184,9 @@ sycl::event convolution_backward_weights(
IntArrayRef dilation,
int64_t groups,
const std::vector<sycl::event>& deps) {
auto& engine = GpuEngineManager::Instance().get_engine();
auto& stream = GpuStreamManager::Instance().get_stream();
auto engine = GpuEngineManager::Instance().get_engine(
{c10::kXPU, c10::xpu::current_device()});
auto stream = GpuStreamManager::Instance().get_stream();
bool is_channels_last = use_channels_last_for_conv(src, diff_dst);
@ -290,8 +292,9 @@ sycl::event convolution_backward_data(
int64_t groups,
bool bias_defined,
const std::vector<sycl::event>& deps) {
auto& engine = GpuEngineManager::Instance().get_engine();
auto& stream = GpuStreamManager::Instance().get_stream();
auto engine = GpuEngineManager::Instance().get_engine(
{c10::kXPU, c10::xpu::current_device()});
auto stream = GpuStreamManager::Instance().get_stream();
bool is_channels_last = use_channels_last_for_conv(diff_dst, weight);

View File

@ -158,8 +158,9 @@ sycl::event deconvolution(
int64_t groups,
Attr& attr,
const std::vector<sycl::event>& deps) {
auto& engine = GpuEngineManager::Instance().get_engine();
auto& stream = GpuStreamManager::Instance().get_stream();
auto engine = GpuEngineManager::Instance().get_engine(
{c10::kXPU, c10::xpu::current_device()});
auto stream = GpuStreamManager::Instance().get_stream();
bool is_channels_last_suggested = use_channels_last_for_conv(src, weight);
@ -248,8 +249,9 @@ sycl::event deconvolution_backward_data(
int64_t groups,
bool bias_defined,
const std::vector<sycl::event>& deps) {
auto& engine = GpuEngineManager::Instance().get_engine();
auto& stream = GpuStreamManager::Instance().get_stream();
auto engine = GpuEngineManager::Instance().get_engine(
{c10::kXPU, c10::xpu::current_device()});
auto stream = GpuStreamManager::Instance().get_stream();
bool is_channels_last_suggested =
use_channels_last_for_conv(diff_dst, weight);
@ -345,8 +347,9 @@ sycl::event deconvolution_backward_weights(
IntArrayRef dilation,
int64_t groups,
const std::vector<sycl::event>& deps) {
auto& engine = GpuEngineManager::Instance().get_engine();
auto& stream = GpuStreamManager::Instance().get_stream();
auto engine = GpuEngineManager::Instance().get_engine(
{c10::kXPU, c10::xpu::current_device()});
auto stream = GpuStreamManager::Instance().get_stream();
bool is_channels_last_suggested = use_channels_last_for_conv(src, diff_dst);

View File

@ -30,8 +30,9 @@ sycl::event matmul(
"oneDNN input matrixes must have the same ranks");
TORCH_CHECK(result.defined(), "oneDNN matmul result should be defined");
auto& engine = GpuEngineManager::Instance().get_engine();
auto& stream = GpuStreamManager::Instance().get_stream();
at::Device cur_device = at::Device(at::kXPU, c10::xpu::current_device());
auto engine = GpuEngineManager::Instance().get_engine(cur_device);
auto stream = GpuStreamManager::Instance().get_stream();
at::Tensor m1 = mat1;
at::Tensor m2 = mat2;

View File

@ -107,8 +107,9 @@ at::Tensor quantized_convolution(
output.defined(),
"A valid output is required for quantized convolution.");
auto& engine = GpuEngineManager::Instance().get_engine();
auto& stream = GpuStreamManager::Instance().get_stream();
auto engine = GpuEngineManager::Instance().get_engine(
{c10::kXPU, c10::xpu::current_device()});
auto stream = GpuStreamManager::Instance().get_stream();
// input tensors config
dnnl::memory::dims src_dims = act.sizes().vec();

View File

@ -125,8 +125,9 @@ void quantized_matmul(
attr);
size_t dims = result.dim();
auto& engine = GpuEngineManager::Instance().get_engine();
auto& stream = GpuStreamManager::Instance().get_stream();
at::Device cur_device = at::Device(at::kXPU, c10::xpu::current_device());
auto engine = GpuEngineManager::Instance().get_engine(cur_device);
auto stream = GpuStreamManager::Instance().get_stream();
at::Tensor m1 = is_onednn_matmul_strides(mat1) ? mat1 : mat1.contiguous();
at::Tensor m2 = is_onednn_matmul_strides(mat2) ? mat2 : mat2.contiguous();

View File

@ -29,7 +29,8 @@ static inline void dnnl_delete(
}
GpuEngineManager::GpuEngineManager() {
c10::DeviceIndex device_count = c10::xpu::device_count_ensure_non_zero();
c10::DeviceIndex device_count = c10::xpu::device_count();
TORCH_INTERNAL_ASSERT(device_count > 0);
for (const auto i : c10::irange(device_count)) {
static dnnl::graph::allocator alloc =
dnnl::graph::sycl_interop::make_allocator(dnnl_alloc, dnnl_delete);

View File

@ -25,15 +25,10 @@ bool set_onednn_verbose(int level);
struct TORCH_XPU_API GpuEngineManager {
static GpuEngineManager& Instance(); // Singleton
dnnl::engine& get_engine(
DeviceIndex device_index = c10::xpu::current_device()) {
c10::xpu::check_device_index(device_index);
return *engine_pool[device_index];
}
dnnl::engine& get_engine(const Device& device) {
TORCH_INTERNAL_ASSERT(device.type() == kXPU);
return get_engine(device.index());
TORCH_INTERNAL_ASSERT(device.index() < c10::xpu::device_count());
return *engine_pool[device.index()];
}
GpuEngineManager(GpuEngineManager const&) = delete;
@ -53,15 +48,16 @@ struct TORCH_XPU_API GpuEngineManager {
struct TORCH_XPU_API GpuStreamManager {
static GpuStreamManager& Instance(); // Singleton
dnnl::stream& get_stream(
DeviceIndex device_index = c10::xpu::current_device()) {
auto stream = c10::xpu::getCurrentXPUStream(device_index);
dnnl::stream get_stream() {
auto stream = c10::xpu::getCurrentXPUStream();
auto priority = stream.priority();
auto device_index = stream.device_index();
if (stream_pool[device_index][priority].find(stream) ==
stream_pool[device_index][priority].end()) {
stream_pool[device_index][priority][stream] =
std::make_shared<dnnl::stream>(dnnl::sycl_interop::make_stream(
GpuEngineManager::Instance().get_engine(device_index),
GpuEngineManager::Instance().get_engine(
{c10::kXPU, device_index}),
stream.queue()));
}
return *stream_pool[device_index][priority][stream];
@ -74,7 +70,8 @@ struct TORCH_XPU_API GpuStreamManager {
protected:
GpuStreamManager() {
c10::DeviceIndex device_count = c10::xpu::device_count_ensure_non_zero();
c10::DeviceIndex device_count = c10::xpu::device_count();
TORCH_INTERNAL_ASSERT(device_count > 0);
stream_pool.resize(device_count);
}
~GpuStreamManager() = default;

View File

@ -133,10 +133,6 @@ class MetalShaderLibrary {
TensorIteratorBase& iter,
const std::string& name,
std::optional<int64_t> extra = std::nullopt);
void exec_binary_kernel(
TensorIteratorBase& iter,
const std::string& name,
const bool supports_dense = true);
protected:
virtual MTLLibrary_t getLibrary();

View File

@ -1010,49 +1010,6 @@ void MetalShaderLibrary::exec_unary_kernel(TensorIteratorBase& iter,
}
}
// Encodes and dispatches the binary kernel `name` over the two inputs and the
// output of `iter`, one thread per element (iter.numel()).
//
// When `supports_dense` is set and the iterator is contiguous, the
// "<name>_dense_<type>" pipeline is used with only the three tensors bound.
// Otherwise the "<name>_<type>" pipeline is used, with the offsets buffer
// produced by generateKernelDataOffsets bound at index 3 and the dispatch
// wrapped in MPS profiler begin/end calls.
//
// Fails a TORCH_CHECK if the iterator's common dtype is double.
void MetalShaderLibrary::exec_binary_kernel(TensorIteratorBase& iter,
                                            const std::string& name,
                                            const bool supports_dense) {
  TORCH_CHECK(iter.common_dtype() != at::kDouble, "float64 is not supported on MPS");

  Tensor lhs = iter.input(0);
  Tensor rhs = iter.input(1);
  Tensor result = iter.output();

  id<MTLDevice> device = MPSDevice::getInstance()->device();
  MPSStream* stream = getCurrentMPSStream();
  const uint32_t nDim = iter.ndim();
  constexpr uint32_t nOffsets = 3;
  const uint32_t threadCount = iter.numel();

  dispatch_sync_with_rethrow(stream->queue(), ^() {
    @autoreleasepool {
      auto encoder = stream->commandEncoder();
      const bool takeDensePath = supports_dense && iter.is_contiguous();
      if (takeDensePath) {
        // Contiguous fast path: no per-element offsets buffer is needed.
        const auto denseKernelName = fmt::format("{}_dense_{}", name, scalarToMetalTypeString(lhs));
        auto densePSO = getPipelineStateForFunc(denseKernelName);
        [encoder setComputePipelineState:densePSO];
        mtl_setArgs(encoder, lhs, rhs, result);
        mtl_dispatch1DJob(encoder, densePSO, threadCount);
      } else {
        const auto stridedKernelName = fmt::format("{}_{}", name, scalarToMetalTypeString(lhs));
        auto offsetsBuffer = generateKernelDataOffsets(encoder, iter);
        auto stridedPSO = getPipelineStateForFunc(stridedKernelName);
        // this function call is a no-op if MPS Profiler is not enabled
        getMPSProfiler().beginProfileKernel(stridedPSO, stridedKernelName, {lhs, rhs});
        [encoder setComputePipelineState:stridedPSO];
        mtl_setArgs(encoder, lhs, rhs, result);
        [encoder setBuffer:offsetsBuffer offset:0 atIndex:3];
        mtl_dispatch1DJob(encoder, stridedPSO, threadCount);
        getMPSProfiler().endProfileKernel(stridedPSO);
      }
    }
  });
}
MetalShaderLibrary& MetalShaderLibrary::getBundledLibrary() {
static BundledShaderLibary l;
return l;

View File

@ -1,4 +1,3 @@
#include <c10/metal/indexing.h>
#include <c10/metal/special_math.h>
#include <c10/metal/utils.h>
#include <metal_stdlib>
@ -92,6 +91,59 @@ struct polar_functor {
}
};
// Future BinaryTensorIterator
// Type produced by calling a functor of type F with two arguments of type T.
// Used below as the element type of the output buffer, which may differ from T.
template <typename T, typename F>
using result_of = decltype(::metal::declval<F>()(
::metal::declval<T>(),
::metal::declval<T>()));
// Element-wise binary kernel for inputs addressed through an offsets table.
// Each thread reads its own entry of `offsets`: .x is the byte offset into the
// output buffer, .y into `input_` and .z into `other_`. It then writes
// F()(input, other) to the output element.
template <typename T, typename F>
kernel void binary_indexing(
    constant void* input_ [[buffer(0)]],
    constant void* other_ [[buffer(1)]],
    device void* out_ [[buffer(2)]],
    constant uint3* offsets [[buffer(3)]],
    uint tid [[thread_position_in_grid]]) {
  device uint8_t* dst_bytes = (device uint8_t*)out_ + offsets[tid].x;
  constant uint8_t* lhs_bytes = (constant uint8_t*)input_ + offsets[tid].y;
  constant uint8_t* rhs_bytes = (constant uint8_t*)other_ + offsets[tid].z;

  auto dst = (device result_of<T, F>*)dst_bytes;
  auto lhs = (constant T*)lhs_bytes;
  auto rhs = (constant T*)rhs_bytes;

  F op;
  *dst = op(*lhs, *rhs);
}
// Element-wise binary kernel for the case where all three buffers can be
// indexed directly by thread id: out[tid] = F()(input[tid], other[tid]).
template <typename T, typename F>
kernel void binary_dense(
    constant T* input [[buffer(0)]],
    constant T* other [[buffer(1)]],
    device result_of<T, F>* out [[buffer(2)]],
    uint tid [[thread_position_in_grid]]) {
  F op;
  const T lhs = input[tid];
  const T rhs = other[tid];
  out[tid] = op(lhs, rhs);
}
// Explicitly instantiates both kernels for functor NAME##_functor and element
// type DTYPE:
//   - binary_indexing, exported under the host name "<NAME>_<DTYPE>"
//   - binary_dense,    exported under the host name "<NAME>_dense_<DTYPE>"
// Intended to be followed by a semicolon at the use site.
#define REGISTER_BINARY_INDEXING_OP(NAME, DTYPE) \
template [[host_name(#NAME "_" #DTYPE)]] kernel void \
binary_indexing<DTYPE, NAME##_functor>( \
constant void* input_, \
constant void* other_, \
device void* out_, \
constant uint3* offsets, \
uint tid); \
template [[host_name(#NAME "_dense_" #DTYPE)]] kernel void \
binary_dense<DTYPE, NAME##_functor>( \
constant DTYPE * input_, \
constant DTYPE * other_, \
device result_of<DTYPE, NAME##_functor> * out_, \
uint tid)
#define REGISTER_BINARY_OP(NAME, DTYPE) \
template [[host_name(#NAME "_" #DTYPE)]] kernel void NAME<DTYPE>( \
constant void* input_, \
constant void* other_, \
device void* out_, \
constant uint3* offsets, \
uint tid)
REGISTER_BINARY_INDEXING_OP(copysign, long);
REGISTER_BINARY_INDEXING_OP(copysign, int);
REGISTER_BINARY_INDEXING_OP(copysign, float);
@ -138,7 +190,9 @@ kernel void complex_mul(
out[1] = input[0] * other[1] + input[1] * other[0];
}
// Constructs complex tensor from real and imaginary planes
REGISTER_BINARY_OP(complex_mul, float);
REGISTER_BINARY_OP(complex_mul, half);
template <typename T>
kernel void complex_kernel(
constant void* real_ [[buffer(0)]],
@ -153,15 +207,5 @@ kernel void complex_kernel(
out[1] = imag[0];
}
#define REGISTER_BINARY_OP(NAME, DTYPE) \
template [[host_name(#NAME "_" #DTYPE)]] kernel void NAME<DTYPE>( \
constant void* input_, \
constant void* other_, \
device void* out_, \
constant uint3* offsets, \
uint tid)
REGISTER_BINARY_OP(complex_mul, float);
REGISTER_BINARY_OP(complex_mul, half);
REGISTER_BINARY_OP(complex_kernel, float);
REGISTER_BINARY_OP(complex_kernel, half);

View File

@ -1,63 +1,16 @@
#include <c10/metal/indexing.h>
#include <c10/metal/special_math.h>
using namespace c10::metal;
using namespace metal;
DEFINE_UNARY_FLOATING_FUNCTOR(bessel_j0_forward);
DEFINE_UNARY_FLOATING_FUNCTOR(bessel_j1_forward);
DEFINE_UNARY_FLOATING_FUNCTOR(modified_bessel_i0_forward);
DEFINE_UNARY_FLOATING_FUNCTOR(modified_bessel_i1_forward);
DEFINE_UNARY_FLOATING_FUNCTOR(i0);
DEFINE_UNARY_FLOATING_FUNCTOR(i0e);
DEFINE_UNARY_FLOATING_FUNCTOR(i1);
DEFINE_UNARY_FLOATING_FUNCTOR(i1e);
DEFINE_UNARY_FLOATING_FUNCTOR(spherical_bessel_j0);
DEFINE_UNARY_FLOATING_FUNCTOR(entr);
// TODO: Replace these hand-written functors with DEFINE_UNARY_FLOATING_FUNCTOR.
// For some reason, instantiating bessel_y[01] through that macro on M1/M2 results in
// Failed to created pipeline state object, error: Error Domain=AGXMetalG14X
// Code=3 "Compiler encountered an internal error"
// Functor wrapping bessel_y0_forward with one overload per input category.
struct bessel_y0_forward_functor {
  // bool input: returns a precomputed constant for `true` and -infinity for
  // `false` (presumably the float32 values of Y0(1) and Y0(0) - confirm).
  inline float operator()(const bool x) {
    if (x) {
      return 0.08825694769620895;
    }
    return -INFINITY;
  }
  // Integral input: evaluated in float, result returned as float.
  template <typename T>
  inline enable_if_t<is_integral_v<T>, float> operator()(const T x) {
    const float as_float = static_cast<float>(x);
    return bessel_y0_forward(as_float);
  }
  // Floating-point input: result is cast back to the input type.
  template <typename T>
  inline enable_if_t<is_floating_point_v<T>, T> operator()(const T x) {
    return static_cast<T>(bessel_y0_forward(x));
  }
};
// Functor wrapping bessel_y1_forward with one overload per input category.
struct bessel_y1_forward_functor {
  // bool input: returns a precomputed constant for `true` and -infinity for
  // `false` (presumably the float32 values of Y1(1) and Y1(0) - confirm).
  inline float operator()(const bool x) {
    if (x) {
      return -0.7812128067016602;
    }
    return -INFINITY;
  }
  // Integral input: evaluated in float, result returned as float.
  template <typename T>
  inline enable_if_t<is_integral_v<T>, float> operator()(const T x) {
    const float as_float = static_cast<float>(x);
    return bessel_y1_forward(as_float);
  }
  // Floating-point input: result is cast back to the input type.
  template <typename T>
  inline enable_if_t<is_floating_point_v<T>, T> operator()(const T x) {
    return static_cast<T>(bessel_y1_forward(x));
  }
};
#define REGISTER_SPECIAL(DTI, DTO) \
REGISTER_UNARY_OP(bessel_j0_forward, DTI, DTO); \
REGISTER_UNARY_OP(bessel_j1_forward, DTI, DTO); \
REGISTER_UNARY_OP(modified_bessel_i0_forward, DTI, DTO); \
REGISTER_UNARY_OP(modified_bessel_i1_forward, DTI, DTO); \
REGISTER_UNARY_OP(bessel_y0_forward, DTI, DTO); \
REGISTER_UNARY_OP(bessel_y1_forward, DTI, DTO); \
REGISTER_UNARY_OP(i0, DTI, DTO); \
REGISTER_UNARY_OP(i0e, DTI, DTO); \
REGISTER_UNARY_OP(i1, DTI, DTO); \
REGISTER_UNARY_OP(i1e, DTI, DTO); \
REGISTER_UNARY_OP(spherical_bessel_j0, DTI, DTO); \
#define REGISTER_SPECIAL(DTI, DTO) \
REGISTER_UNARY_OP(i0, DTI, DTO); \
REGISTER_UNARY_OP(i1, DTI, DTO); \
REGISTER_UNARY_OP(spherical_bessel_j0, DTI, DTO); \
REGISTER_UNARY_OP(entr, DTI, DTO)
REGISTER_SPECIAL(float, float);

View File

@ -268,31 +268,12 @@ kernel void upsample_bilinear2d(
}
}
// Triangle (bilinear) interpolation weight used by the anti-aliased upsample
// kernel. `area_factor` is the half-width of the filter support in units of
// the clamped scale.
struct BilinearFunctor {
  // Returns 1 - |x| for |x| < 1 and 0 elsewhere. The previous version
  // returned |x| outside the support, which would have produced growing
  // positive weights for any sample farther than one unit from the centre.
  inline float operator()(float x) {
    x = abs(x);
    return x < 1.0 ? 1.0 - x : 0.0;
  }
  static constant constexpr float area_factor = 1.0;
};
// Triangle (bilinear) interpolation weight: 1 - |x| for |x| < 1, 0 elsewhere.
// The previous version returned |x| outside the support, which would have
// produced growing positive weights for any sample farther than one unit from
// the centre; it also evaluated abs(x) up to three times.
inline float bilinear_functor(float x) {
  const float ax = abs(x);
  return ax < 1.0 ? 1.0 - ax : 0.0;
}
// Piecewise-cubic interpolation weight used by the anti-aliased upsample
// kernel. `area_factor` is the half-width of the filter support in units of
// the clamped scale.
struct BicubicFunctor {
  inline float operator()(float x) {
    // Bicubic convolution kernel, see
    // https://en.wikipedia.org/wiki/Bicubic_interpolation#Bicubic_convolution_algorithm
    x = abs(x);
    if (x < 1.0) {
      // Inner lobe.
      return 1.0 + (1.5 * x - 2.5) * x * x;
    } else if (x < 2.0) {
      // Outer lobe.
      return 2.0 - 0.5 * ((x - 5.0) * x + 8.0) * x;
    } else {
      // Outside the support.
      return 0;
    }
  }
  static constant constexpr float area_factor = 2.0;
};
template <typename T, typename F>
kernel void upsample_2d_aa(
template <typename T>
kernel void upsample_bilinear2d_aa(
constant T* inputData [[buffer(0)]],
device T* outputData [[buffer(1)]],
constant ulong4& input_strides [[buffer(2)]],
@ -305,26 +286,15 @@ kernel void upsample_2d_aa(
auto output_x = thread_index % static_cast<uint>(output_sizes.w);
auto output_y = thread_index / static_cast<uint>(output_sizes.w);
(void)align_corners; // Align corners is unused for AA algorithm
F f;
auto x_center = area_pixel_compute_source_index(
scales.x,
output_x,
/*align_corners=*/false,
/*cubic=*/F::area_factor == 2.0);
scales.x, output_x, /*align_corners=*/false, /*cubic=*/false);
auto y_center = area_pixel_compute_source_index(
scales.y,
output_y,
/*align_corners=*/false,
/*cubic=*/F::area_factor == 2.0);
scales.y, output_y, /*align_corners=*/false, /*cubic=*/false);
auto clamped_scales = max(1.0, scales);
auto x_min =
max(0L, long(floor(x_center - f.area_factor * clamped_scales.x + 1)));
auto x_max = min(
input_sizes.w, long(ceil(x_center + f.area_factor * clamped_scales.x)));
auto y_min =
max(0L, long(floor(y_center - f.area_factor * clamped_scales.y + 1)));
auto y_max = min(
input_sizes.z, long(ceil(y_center + f.area_factor * clamped_scales.y)));
auto x_min = max(0L, long(floor(x_center - clamped_scales.x + 1)));
auto x_max = min(input_sizes.w, long(ceil(x_center + clamped_scales.x)));
auto y_min = max(0L, long(floor(y_center - clamped_scales.y + 1)));
auto y_max = min(input_sizes.z, long(ceil(y_center + clamped_scales.y)));
for (int n = 0; n < output_sizes.x; n++) {
for (int c = 0; c < output_sizes.y; c++) {
float res = 0.0;
@ -332,9 +302,9 @@ kernel void upsample_2d_aa(
constant auto* input =
inputData + n * input_strides.x + c * input_strides.y;
for (auto y = y_min; y < y_max; ++y) {
auto dy = f((y - y_center) / clamped_scales.y);
auto dy = bilinear_functor((y - y_center) / clamped_scales.y);
for (auto x = x_min; x < x_max; ++x) {
auto dx = f((x - x_center) / clamped_scales.x);
auto dx = bilinear_functor((x - x_center) / clamped_scales.x);
auto val = input[x * input_strides.w + y * input_strides.z];
res += val * dx * dy;
ws += dx * dy;
@ -486,19 +456,6 @@ kernel void upsample_bicubic2d_backward(
constant bool& align_corners [[buffer(7)]], \
uint thread_index [[thread_position_in_grid]])
// Explicitly instantiates upsample_2d_aa<DTYPE, FUNCTOR> and exports it under
// the host name "upsample_<NAME>_<DTYPE>". FUNCTOR is the interpolation-weight
// functor type passed as the kernel's second template argument. The buffer
// indices listed here must match the kernel template's declaration.
#define INSTANTIATE_UPSAMPLE_2D_AA(NAME, FUNCTOR, DTYPE) \
template [[host_name("upsample_" #NAME "_" #DTYPE)]] kernel void \
upsample_2d_aa<DTYPE, FUNCTOR>( \
constant DTYPE * inputData [[buffer(0)]], \
device DTYPE * outputData [[buffer(1)]], \
constant ulong4 & input_strides [[buffer(2)]], \
constant ulong4 & output_strides [[buffer(3)]], \
constant long4 & input_sizes [[buffer(4)]], \
constant long4 & output_sizes [[buffer(5)]], \
constant float2 & scales [[buffer(6)]], \
constant bool& align_corners [[buffer(7)]], \
uint thread_index [[thread_position_in_grid]])
#define INSTANTIATE_UPSAMPLE_2D_BACKWARD(NAME, DTYPE) \
template [[host_name("upsample_" #NAME "_backward_" #DTYPE)]] kernel void \
upsample_##NAME##_backward<DTYPE>( \
@ -525,12 +482,11 @@ kernel void upsample_bicubic2d_backward(
constant bool& align_corners [[buffer(7)]], \
uint thread_index [[thread_position_in_grid]])
#define INSTANTIATE_UPSAMPLE_ALL(DTYPE) \
INSTANTIATE_UPSAMPLE_2D(bicubic2d, DTYPE); \
INSTANTIATE_UPSAMPLE_2D_AA(bicubic2d_aa, BicubicFunctor, DTYPE); \
INSTANTIATE_UPSAMPLE_2D_BACKWARD(bicubic2d, DTYPE); \
INSTANTIATE_UPSAMPLE_2D(bilinear2d, DTYPE); \
INSTANTIATE_UPSAMPLE_2D_AA(bilinear2d_aa, BilinearFunctor, DTYPE); \
#define INSTANTIATE_UPSAMPLE_ALL(DTYPE) \
INSTANTIATE_UPSAMPLE_2D(bicubic2d, DTYPE); \
INSTANTIATE_UPSAMPLE_2D_BACKWARD(bicubic2d, DTYPE); \
INSTANTIATE_UPSAMPLE_2D(bilinear2d, DTYPE); \
INSTANTIATE_UPSAMPLE_2D(bilinear2d_aa, DTYPE); \
INSTANTIATE_UPSAMPLE_LINEAR(DTYPE);
INSTANTIATE_UPSAMPLE_2D(bilinear2d, uchar);

View File

@ -44,8 +44,7 @@ std::tuple<Tensor, Tensor> _scaled_dot_product_attention_math_mps(const Tensor&
TORCH_CHECK(!attn_mask.has_value(),
"_scaled_dot_product_attention: Explicit attn_mask should not be set when is_causal=True");
}
TORCH_CHECK(query.size(-3) == key.size(-3) && key.size(-3) == value.size(-3),
"number of heads in query/key/value should match");
TORCH_CHECK(dropout_p == 0.0, "_scaled_dot_product_attention_math_for_mps: dropout_p != 0.0 is not supported");
TORCH_CHECK(macOS15_0_plus || (query.is_contiguous() && key.is_contiguous() && value.is_contiguous()),
"_scaled_dot_product_attention_math_for_mps: query, key, and value must be contiguous");
@ -56,7 +55,6 @@ std::tuple<Tensor, Tensor> _scaled_dot_product_attention_math_mps(const Tensor&
auto [q_, sq] = ensure_4d(query);
auto [k_, sk] = ensure_4d(key);
auto [v_, sv] = ensure_4d(value);
std::optional<Tensor> mask_;
if (attn_mask) {
auto maskExpandedDims = query.sizes().vec();

View File

@ -23,13 +23,54 @@
#endif
namespace at::native {
namespace mps {
#ifndef PYTORCH_JIT_COMPILE_SHADERS
static auto& lib = mps::MetalShaderLibrary::getBundledLibrary();
static auto& lib = MetalShaderLibrary::getBundledLibrary();
#else
#include <ATen/native/mps/BinaryKernel_metallib.h>
#endif
namespace mps {
// Encodes and dispatches the Metal kernel implementing the binary op
// `func_name` for the two inputs and single output of `iter`.
//
//   iter           - tensor iterator providing input(0), input(1) and output().
//   func_name      - base kernel name; the input's Metal type string is
//                    appended to form the pipeline-state name.
//   supports_dense - when true and the iterator is contiguous, the
//                    "<func_name>_dense_<type>" kernel is used and no offset
//                    table is generated; otherwise "<func_name>_<type>" is
//                    dispatched with per-thread offsets bound at buffer 3.
//
// Raises (via TORCH_CHECK) if the iterator's common dtype is float64.
static void binary_mps_impl(TensorIteratorBase& iter, const std::string func_name, bool supports_dense = true) {
  TORCH_CHECK(iter.common_dtype() != at::kDouble, "float64 is not supported on MPS");

  Tensor input = iter.input(0);
  Tensor other = iter.input(1);
  Tensor out = iter.output();

  MPSStream* mpsStream = getCurrentMPSStream();
  // NOTE(review): numel() is narrowed to 32 bits here; whether callers
  // guarantee the iterator fits in uint32_t is not visible in this block.
  const uint32_t numThreads = iter.numel();
  dispatch_sync_with_rethrow(mpsStream->queue(), ^() {
    @autoreleasepool {
      auto computeEncoder = mpsStream->commandEncoder();
      if (supports_dense && iter.is_contiguous()) {
        const auto kernel_name = fmt::format("{}_dense_{}", func_name, scalarToMetalTypeString(input));
        auto binaryPSO = lib.getPipelineStateForFunc(kernel_name);
        // Profile the dense path as well, so that both dispatch routes are
        // visible to the MPS profiler (no-op when the profiler is disabled).
        getMPSProfiler().beginProfileKernel(binaryPSO, kernel_name, {input, other});
        [computeEncoder setComputePipelineState:binaryPSO];
        mtl_setArgs(computeEncoder, input, other, out);
        mtl_dispatch1DJob(computeEncoder, binaryPSO, numThreads);
        getMPSProfiler().endProfileKernel(binaryPSO);
        return;
      }
      const std::string kernel = func_name + "_" + scalarToMetalTypeString(input);
      auto kernelDataOffsets = generateKernelDataOffsets(computeEncoder, iter);

      id<MTLComputePipelineState> binaryPSO = lib.getPipelineStateForFunc(kernel);

      // this function call is a no-op if MPS Profiler is not enabled
      getMPSProfiler().beginProfileKernel(binaryPSO, kernel, {input, other});

      [computeEncoder setComputePipelineState:binaryPSO];
      mtl_setArgs(computeEncoder, input, other, out);
      // The strided kernels read their per-thread offsets from buffer index 3.
      [computeEncoder setBuffer:kernelDataOffsets offset:0 atIndex:3];
      mtl_dispatch1DJob(computeEncoder, binaryPSO, numThreads);

      getMPSProfiler().endProfileKernel(binaryPSO);
    }
  });
}
void complex_mul_out(const Tensor& input, const Tensor& other, const Tensor& output) {
TORCH_INTERNAL_ASSERT(c10::isComplexType(input.scalar_type()) || c10::isComplexType(other.scalar_type()));
@ -48,43 +89,43 @@ void complex_mul_out(const Tensor& input, const Tensor& other, const Tensor& out
auto iter =
TensorIteratorConfig().add_output(output_as_real).add_input(input_as_real).add_input(other_as_real).build();
lib.exec_binary_kernel(iter, "complex_mul", /*supports_dense=*/false);
mps::binary_mps_impl(iter, "complex_mul", false);
}
} // namespace mps
static void fmax_mps_kernel(TensorIteratorBase& iter) {
if (isFloatingType(iter.common_dtype())) {
lib.exec_binary_kernel(iter, "fmax");
mps::binary_mps_impl(iter, "fmax");
} else {
at::maximum_out(const_cast<Tensor&>(iter.output()), iter.input(0), iter.input(1));
}
}
static void fmin_mps_kernel(TensorIteratorBase& iter) {
if (isFloatingType(iter.common_dtype())) {
lib.exec_binary_kernel(iter, "fmin");
mps::binary_mps_impl(iter, "fmin");
} else {
at::minimum_out(const_cast<Tensor&>(iter.output()), iter.input(0), iter.input(1));
}
}
static void copysign_mps_kernel(TensorIteratorBase& iter) {
lib.exec_binary_kernel(iter, "copysign");
mps::binary_mps_impl(iter, "copysign");
}
static void nextafter_mps_kernel(TensorIteratorBase& iter) {
TORCH_CHECK_TYPE(isFloatingType(iter.common_dtype()), "nextafter_mps not implemented for non-floating types");
lib.exec_binary_kernel(iter, "nextafter");
mps::binary_mps_impl(iter, "nextafter");
}
static void zeta_mps_kernel(TensorIteratorBase& iter) {
TORCH_CHECK_TYPE(isFloatingType(iter.common_dtype()), "zeta_mps not implemented for non-floating types");
lib.exec_binary_kernel(iter, "zeta");
mps::binary_mps_impl(iter, "zeta");
}
static void xlog1py_mps_kernel(TensorIteratorBase& iter) {
TORCH_CHECK_TYPE(isFloatingType(iter.common_dtype()), "xlog1py_mps not implemented for non-floating types");
lib.exec_binary_kernel(iter, "xlog1py");
mps::binary_mps_impl(iter, "xlog1py");
}
REGISTER_DISPATCH(fmax_stub, &fmax_mps_kernel)
@ -106,7 +147,7 @@ Tensor& polar_out_mps(const Tensor& abs, const Tensor& angle, Tensor& output) {
auto output_as_real = at::view_as_real(output).select(output.dim(), 0);
auto iter = TensorIteratorConfig().add_output(output_as_real).add_input(abs).add_input(angle).build();
lib.exec_binary_kernel(iter, "polar");
mps::binary_mps_impl(iter, "polar");
return output;
}
@ -122,7 +163,7 @@ Tensor& complex_out_mps(const Tensor& real, const Tensor& imag, Tensor& output)
auto output_as_real = at::view_as_real(output).select(output.dim(), 0);
auto iter = TensorIteratorConfig().add_output(output_as_real).add_input(real).add_input(imag).build();
lib.exec_binary_kernel(iter, "complex_kernel", /*supports_dense=*/false);
mps::binary_mps_impl(iter, "complex_kernel", false);
return output;
}
} // namespace at::native

View File

@ -14,6 +14,7 @@
#include <ATen/ops/atan2_native.h>
#include <ATen/ops/div_native.h>
#include <ATen/ops/eq_native.h>
#include <ATen/ops/floor_divide_native.h>
#include <ATen/ops/fmod_native.h>
#include <ATen/ops/ge_native.h>
#include <ATen/ops/gt_native.h>
@ -446,8 +447,19 @@ TORCH_IMPL_FUNC(pow_Scalar_out_mps)(const Scalar& base, const Tensor& exp, const
}
}
static void div_floor_kernel_mps(TensorIteratorBase& iter) {
mps::div_mode_template(iter.input(0), iter.input(1), "floor", iter.output(0), "floor_divide_out");
Tensor& floor_divide_out_mps(const Tensor& self, const Tensor& other, Tensor& result) {
mps::div_mode_template(self, other, "floor", result, "floor_divide_out");
return result;
}
// Out-of-place floor division on MPS: allocates a result tensor and lets
// mps::div_mode_template fill it using the "floor" rounding mode.
Tensor floor_divide_mps(const Tensor& self, const Tensor& other) {
  // NOTE(review): the result is allocated like `self`; whether
  // div_mode_template resizes it when `other` broadcasts to a larger shape is
  // not visible from here - confirm.
  auto result = at::empty_like(self);
  mps::div_mode_template(self, other, "floor", result, "floor_divide");
  return result;
}
// In-place floor division on MPS: forwards to the out-variant with `self`
// acting as both the dividend and the destination.
Tensor& floor_divide_mps_(Tensor& self, const Tensor& other) {
  Tensor& updated = floor_divide_out_mps(self, other, /*result=*/self);
  return updated;
}
TORCH_IMPL_FUNC(remainder_out_mps)(const Tensor& self, const Tensor& other, const Tensor& output) {
@ -526,6 +538,4 @@ TORCH_IMPL_FUNC(xlogy_out_mps)(const Tensor& self, const Tensor& other, const Te
TORCH_IMPL_FUNC(lerp_Scalar_mps)(const Tensor& self, const Tensor& end, const Scalar& weight, const Tensor& out) {
mps::add_sub_lerp_template(self, end, weight, out, "lerp");
}
REGISTER_DISPATCH(div_floor_stub, &div_floor_kernel_mps);
} // namespace at::native

View File

@ -60,25 +60,9 @@ static void _fused_sgd_with_momentum_kernel_mps_(TensorList params,
const bool is_first_step,
const std::optional<Tensor>& grad_scale,
const std::optional<Tensor>& found_inf) {
if (lr_tensor.is_cpu()) {
return _fused_sgd_with_momentum_kernel_mps_(params,
grads,
momentum_buffer_list,
weight_decay,
momentum,
lr_tensor.item<double>(),
dampening,
nesterov,
maximize,
is_first_step,
grad_scale,
found_inf);
}
TORCH_CHECK_GT(momentum, 0);
TORCH_CHECK(native::check_fast_path_restrictions({params, grads, momentum_buffer_list}));
TORCH_CHECK(lr_tensor.device() == params[0].device(), "lr must be on the same GPU device as the params");
std::vector<std::vector<Tensor>> tensor_lists{params.vec(), grads.vec(), momentum_buffer_list.vec()};
const auto kernel_name = "fused_sgd_momentum_" + scalarToMetalTypeString(params[0].scalar_type());

View File

@ -16,18 +16,10 @@ static void i0_kernel_mps(TensorIteratorBase& iter) {
lib.exec_unary_kernel(iter, "i0");
}
// Thin MPS dispatch wrappers: each one runs the unary Metal kernel with the
// given name from `lib` over the tensor iterator.

// Runs the "i0e" unary kernel.
static void i0e_kernel_mps(TensorIteratorBase& iter) {
lib.exec_unary_kernel(iter, "i0e");
}
// Runs the "i1" unary kernel.
static void i1_kernel_mps(TensorIteratorBase& iter) {
lib.exec_unary_kernel(iter, "i1");
}
// Runs the "i1e" unary kernel.
static void i1e_kernel_mps(TensorIteratorBase& iter) {
lib.exec_unary_kernel(iter, "i1e");
}
// Runs the "spherical_bessel_j0" unary kernel.
static void spherical_bessel_j0_kernel_mps(TensorIteratorBase& iter) {
lib.exec_unary_kernel(iter, "spherical_bessel_j0");
}
@ -36,40 +28,8 @@ static void entr_kernel_mps(TensorIteratorBase& iter) {
lib.exec_unary_kernel(iter, "entr");
}
// Thin MPS dispatch wrappers for the Bessel family: each one runs the unary
// Metal kernel with the given "<name>_forward" string from `lib` over the
// tensor iterator.

// Runs the "bessel_j0_forward" unary kernel.
static void bessel_j0_kernel_mps(TensorIteratorBase& iter) {
lib.exec_unary_kernel(iter, "bessel_j0_forward");
}
// Runs the "bessel_j1_forward" unary kernel.
static void bessel_j1_kernel_mps(TensorIteratorBase& iter) {
lib.exec_unary_kernel(iter, "bessel_j1_forward");
}
// Runs the "modified_bessel_i0_forward" unary kernel.
static void modified_bessel_i0_kernel_mps(TensorIteratorBase& iter) {
lib.exec_unary_kernel(iter, "modified_bessel_i0_forward");
}
// Runs the "modified_bessel_i1_forward" unary kernel.
static void modified_bessel_i1_kernel_mps(TensorIteratorBase& iter) {
lib.exec_unary_kernel(iter, "modified_bessel_i1_forward");
}
// Runs the "bessel_y0_forward" unary kernel.
static void bessel_y0_kernel_mps(TensorIteratorBase& iter) {
lib.exec_unary_kernel(iter, "bessel_y0_forward");
}
// Runs the "bessel_y1_forward" unary kernel.
static void bessel_y1_kernel_mps(TensorIteratorBase& iter) {
lib.exec_unary_kernel(iter, "bessel_y1_forward");
}
REGISTER_DISPATCH(i0_stub, &i0_kernel_mps)
REGISTER_DISPATCH(special_i0e_stub, &i0e_kernel_mps)
REGISTER_DISPATCH(special_i1_stub, &i1_kernel_mps)
REGISTER_DISPATCH(special_i1e_stub, &i1e_kernel_mps)
REGISTER_DISPATCH(special_bessel_j0_stub, &bessel_j0_kernel_mps)
REGISTER_DISPATCH(special_bessel_j1_stub, &bessel_j1_kernel_mps)
REGISTER_DISPATCH(special_modified_bessel_i0_stub, &modified_bessel_i0_kernel_mps)
REGISTER_DISPATCH(special_modified_bessel_i1_stub, &modified_bessel_i1_kernel_mps)
REGISTER_DISPATCH(special_bessel_y0_stub, &bessel_y0_kernel_mps)
REGISTER_DISPATCH(special_bessel_y1_stub, &bessel_y1_kernel_mps)
REGISTER_DISPATCH(special_spherical_bessel_j0_stub, &spherical_bessel_j0_kernel_mps)
REGISTER_DISPATCH(special_entr_stub, &entr_kernel_mps)
} // namespace at::native

View File

@ -280,7 +280,7 @@ Tensor& angle_out_mps(const Tensor& self, Tensor& output) {
});
return output;
} else {
TORCH_CHECK(!self.is_complex(), "MPS does not support angle with complex input on macOS13")
TORCH_CHECK(!self.is_complex(), "MPS does not support angle with complex imput on macOS13")
mps::unary_op(self, output, "angle_out_mps", ^MPSGraphTensor*(MPSGraph* mpsGraph, MPSGraphTensor* inputTensor) {
// On macOS 13 with non-complex input, realPartOfTensor and imaginaryPartOfTensor are
// not available, and NaN is not propagated correctly:

View File

@ -9,7 +9,6 @@
#include <ATen/Functions.h>
#include <ATen/NativeFunctions.h>
#else
#include <ATen/ops/_upsample_bicubic2d_aa_native.h>
#include <ATen/ops/_upsample_bilinear2d_aa_backward_native.h>
#include <ATen/ops/_upsample_bilinear2d_aa_native.h>
#include <ATen/ops/_upsample_nearest_exact1d.h>
@ -468,16 +467,4 @@ TORCH_IMPL_FUNC(_upsample_bilinear2d_aa_out_mps)
mps::upsample_kernel_out_template(input, output_size, align_corners, scales_h, scales_w, output, "bilinear2d_aa");
}
TORCH_IMPL_FUNC(_upsample_bicubic2d_aa_out_mps)
(const Tensor& input,
IntArrayRef output_size,
bool align_corners,
std::optional<double> scales_h,
std::optional<double> scales_w,
const Tensor& output) {
TORCH_CHECK(at::isFloatingType(input.scalar_type()),
"_upsample_bicubic2d_aa_out_mps only supports floating-point dtypes");
mps::upsample_kernel_out_template(input, output_size, align_corners, scales_h, scales_w, output, "bicubic2d_aa");
}
} // namespace at::native

View File

@ -2749,20 +2749,23 @@
device_check: NoCheck # TensorIterator
variants: function, method
dispatch:
CPU, CUDA, MPS: floor_divide
CPU, CUDA: floor_divide
MPS: floor_divide_mps
SparseCPU, SparseCUDA: floor_divide_sparse
- func: floor_divide_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
device_check: NoCheck # TensorIterator
variants: method
dispatch:
CPU, CUDA, MPS: floor_divide_
CPU, CUDA: floor_divide_
MPS: floor_divide_mps_
SparseCPU, SparseCUDA: floor_divide_sparse_
- func: floor_divide.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
dispatch:
CPU, CUDA, MPS: floor_divide_out
CPU, CUDA: floor_divide_out
MPS: floor_divide_out_mps
SparseCPU, SparseCUDA: floor_divide_out_sparse_zerodim
- func: floor_divide.Scalar(Tensor self, Scalar other) -> Tensor
@ -12766,7 +12769,6 @@
dispatch:
CPU: _upsample_bicubic2d_aa_out_cpu
CUDA: _upsample_bicubic2d_aa_out_cuda
MPS: _upsample_bicubic2d_aa_out_mps
- func: _upsample_bicubic2d_aa(Tensor self, SymInt[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor
python_module: nn
@ -13498,7 +13500,7 @@
structured: True
structured_inherits: TensorIteratorBase
dispatch:
CPU, CUDA, MPS: special_i0e_out
CPU, CUDA: special_i0e_out
tags: pointwise
- func: special_i1(Tensor self) -> Tensor
@ -13526,7 +13528,7 @@
structured: True
structured_inherits: TensorIteratorBase
dispatch:
CPU, CUDA, MPS: special_i1e_out
CPU, CUDA: special_i1e_out
tags: pointwise
- func: special_logit(Tensor self, float? eps=None) -> Tensor
@ -14988,7 +14990,7 @@
- func: special_bessel_j0.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
dispatch:
CPU, CUDA, MPS: special_bessel_j0_out
CPU, CUDA: special_bessel_j0_out
python_module: special
structured_inherits: TensorIteratorBase
structured: True
@ -15003,7 +15005,7 @@
- func: special_bessel_j1.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
dispatch:
CPU, CUDA, MPS: special_bessel_j1_out
CPU, CUDA: special_bessel_j1_out
python_module: special
structured_inherits: TensorIteratorBase
structured: True
@ -15018,7 +15020,7 @@
- func: special_bessel_y0.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
dispatch:
CPU, CUDA, MPS: special_bessel_y0_out
CPU, CUDA: special_bessel_y0_out
python_module: special
structured_inherits: TensorIteratorBase
structured: True
@ -15033,7 +15035,7 @@
- func: special_bessel_y1.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
dispatch:
CPU, CUDA, MPS: special_bessel_y1_out
CPU, CUDA: special_bessel_y1_out
python_module: special
structured_inherits: TensorIteratorBase
structured: True
@ -15440,7 +15442,7 @@
- func: special_modified_bessel_i0.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
dispatch:
CPU, CUDA, MPS: special_modified_bessel_i0_out
CPU, CUDA: special_modified_bessel_i0_out
python_module: special
structured_inherits: TensorIteratorBase
structured: True
@ -15455,7 +15457,7 @@
- func: special_modified_bessel_i1.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
dispatch:
CPU, CUDA, MPS: special_modified_bessel_i1_out
CPU, CUDA: special_modified_bessel_i1_out
python_module: special
structured_inherits: TensorIteratorBase
structured: True

View File

@ -1,359 +0,0 @@
#include <ATen/native/quantized/cpu/ACLUtils.h>
#if AT_MKLDNN_ACL_ENABLED()
#include <ATen/Parallel.h>
#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
#else
#include <ATen/ops/empty.h>
#endif
#include <arm_compute/core/Helpers.h>
#include <arm_compute/core/Types.h>
#include <arm_compute/core/Utils.h>
#include <arm_compute/core/utils/quantization/AsymmHelpers.h>
namespace at::native::acl_utils {
// Shared set-up for the ACL quantized matmul wrappers.
//
// Describes the int8 weight tensor (and, when present, the fp32 bias tensor)
// to ACL and imports the caller-owned buffers instead of copying them. Also
// records a ReLU activation descriptor when the cache key requests fusion.
//
//   weight_dim_0 / weight_dim_1 - weight extents; the ACL shape is built as
//                                 (weight_dim_1, weight_dim_0).
//   weight_scale, weight_offset - quantization parameters; the offset is
//                                 negated when handed to ACL.
//   weight_ptr                  - caller-owned int8 weight buffer.
//   bias_ptr                    - optional caller-owned fp32 bias buffer.
//   cache_key                   - stored in `key`; supplies the FUSE_RELU flag.
QuantMatmul::QuantMatmul(
    int64_t weight_dim_0,
    int64_t weight_dim_1,
    double weight_scale,
    int64_t weight_offset,
    int8_t* weight_ptr,
    std::optional<float*> bias_ptr,
    const QuantMatmulCacheKey& cache_key)
    : key(cache_key) {
  // Weights: signed 8-bit, marked constant, memory imported from the caller.
  arm_compute::TensorInfo weight_info(
      arm_compute::TensorShape(weight_dim_1, weight_dim_0),
      1,
      arm_compute::DataType::QASYMM8_SIGNED,
      arm_compute::QuantizationInfo(weight_scale, -weight_offset, false));
  weight_info.set_are_values_constant(true);
  wei_q_tensor_.allocator()->init(weight_info);
  wei_q_tensor_.allocator()->import_memory(weight_ptr);

  // Optional fp32 bias, likewise imported rather than copied.
  if (bias_ptr) {
    arm_compute::TensorInfo bias_info(
        arm_compute::TensorShape(1, weight_dim_1),
        1,
        arm_compute::DataType::F32);
    bia_tensor_ = arm_compute::Tensor();
    bia_tensor_->allocator()->init(bias_info);
    bia_tensor_->allocator()->import_memory(*bias_ptr);
  }

  // Remember the activation descriptor only when fusion was requested.
  if (std::get<static_cast<int>(QuantMatmulCacheKeyIndex::FUSE_RELU)>(key)) {
    relu_info_ =
        arm_compute::ActivationLayerInfo(arm_compute::ActivationFunction::RELU);
  }
}
// Detaches the imported weight (and optional bias) buffers from ACL.
QuantMatmul::~QuantMatmul() {
  // free() on imported memory does not release it; it only tells ACL that
  // the pointer is no longer in use.
  wei_q_tensor_.allocator()->free();
  if (bia_tensor_) {
    bia_tensor_.value().allocator()->free();
  }
}
// Sets up the ACL tensors for a dynamically quantized matmul.
//
// After the base class has described the weights/bias, this constructor
// initialises three ACL tensors sized from `m` (read from the cache key) and
// the weight extents: the fp32 source, its int8 quantized counterpart and the
// fp32 destination. Backing storage for the quantized source is an ATen tensor
// owned by this object and imported into ACL. An activation layer object is
// created when the base class recorded a ReLU descriptor.
DynamicQuantMatmul::DynamicQuantMatmul(
int64_t weight_dim_0,
int64_t weight_dim_1,
double weight_scale,
int64_t weight_offset,
int8_t* weight_ptr,
std::optional<float*> bias_ptr,
const QuantMatmulCacheKey& cache_key)
: QuantMatmul(
weight_dim_0,
weight_dim_1,
weight_scale,
weight_offset,
weight_ptr,
bias_ptr,
cache_key) {
int64_t m = std::get<static_cast<int>(QuantMatmulCacheKeyIndex::M)>(key);
auto src_q_tensor_info = arm_compute::TensorInfo(
arm_compute::TensorShape(weight_dim_0, m),
1,
// ACL dynamically quantized matmuls only support (signed) int8_t
arm_compute::DataType::QASYMM8_SIGNED,
// TODO: setting the initial offset value to int8_t max instead of zero,
// because ACL currently skips MatrixBReduction calculation if the
// source offset at configuration time is zero. This is fixed by this
// PR: https://review.mlplatform.org/c/ml/ComputeLibrary/+/12820/8 This
// will be set to the actual src offset value at runtime.
arm_compute::QuantizationInfo(
/*scale=*/1.0,
/*offset=*/std::numeric_limits<int8_t>::max(),
/*is_dynamic=*/true));
src_q_tensor_info.set_are_values_constant(false);
// fp32 view of the source; its values also change between runs.
auto src_tensor_info = arm_compute::TensorInfo(
arm_compute::TensorShape(weight_dim_0, m), arm_compute::Format::F32);
src_tensor_info.set_are_values_constant(false);
// fp32 destination.
auto dst_tensor_info = arm_compute::TensorInfo(
arm_compute::TensorShape(weight_dim_1, m), arm_compute::Format::F32);
src_q_tensor.allocator()->init(src_q_tensor_info);
src_tensor.allocator()->init(src_tensor_info);
dst_tensor.allocator()->init(dst_tensor_info);
// Storage for the quantized source is owned by an ATen tensor kept as a member.
src_q_tensor_orig_ =
at::empty({m, weight_dim_0}, at::device(c10::kCPU).dtype(c10::kQInt8));
// allocate/import memory
src_q_tensor.allocator()->import_memory(src_q_tensor_orig_.data_ptr());
// A separate activation layer is only needed when ReLU fusion was requested.
if (relu_info_.has_value()) {
relu = arm_compute::NEActivationLayer();
}
}
// Detaches the imported quantized-source buffer from ACL.
DynamicQuantMatmul::~DynamicQuantMatmul() {
// this will not free memory, it will just tell ACL that we're no longer
// using the pointer
src_q_tensor.allocator()->free();
}
// Checks that ACL supports every stage of the dynamic pipeline for the
// tensors described by this object. Stages are validated in the order
// ReLU (if requested), quantization, GEMM; the first non-OK status is
// returned, otherwise the GEMM validation result is returned.
arm_compute::Status DynamicQuantMatmul::validate() {
  const auto failed = [](const arm_compute::Status& status) {
    return status.error_code() != arm_compute::ErrorCode::OK;
  };

  if (relu_info_.has_value()) {
    auto activation_status = arm_compute::NEActivationLayer::validate(
        dst_tensor.info(), dst_tensor.info(), relu_info_.value());
    if (failed(activation_status)) {
      return activation_status;
    }
  }

  auto quantize_status = arm_compute::NEQuantizationLayer::validate(
      src_tensor.info(), src_q_tensor.info());
  if (failed(quantize_status)) {
    return quantize_status;
  }

  // The bias is optional; ACL expects a null info pointer when it is absent.
  auto* bias_info =
      bia_tensor_.has_value() ? bia_tensor_.value().info() : nullptr;
  return arm_compute::NEGEMMLowpMatrixMultiplyCore::validate(
      src_q_tensor.info(),
      wei_q_tensor_.info(),
      bias_info,
      dst_tensor.info(),
      gemm_info_);
}
// Configures the ACL functions of the dynamic pipeline: the quantization
// layer, the low-precision GEMM and, when present, the in-place ReLU on the
// destination tensor.
void DynamicQuantMatmul::configure() {
  quant.configure(&src_tensor, &src_q_tensor);

  // The bias is optional; ACL expects a null tensor pointer when it is absent.
  auto* bias = bia_tensor_.has_value() ? &bia_tensor_.value() : nullptr;
  gemm.configure(&src_q_tensor, &wei_q_tensor_, bias, &dst_tensor, gemm_info_);

  if (relu.has_value()) {
    relu->configure(&dst_tensor, &dst_tensor, relu_info_.value());
  }
}
// Sets up the ACL tensors and GEMM output stage for a statically quantized
// matmul.
//
// On top of the base-class weight/bias set-up this constructor:
//   - describes the quantized source and destination tensors using the
//     input/output scale and zero point stored in the cache key;
//   - when a bias is present, quantizes the fp32 bias to int32 into a buffer
//     owned by this object and imports that buffer into ACL;
//   - fills in the fixed-point requantization stage (multiplier, shift,
//     offset, clamp bounds), fusing ReLU into the GEMM when requested.
//
// Parameters are forwarded unchanged to QuantMatmul; see that constructor.
StaticQuantMatmul::StaticQuantMatmul(
    int64_t weight_dim_0,
    int64_t weight_dim_1,
    double weight_scale,
    int64_t weight_offset,
    int8_t* weight_ptr,
    std::optional<float*> bias_ptr,
    const QuantMatmulCacheKey& cache_key)
    : QuantMatmul(
          weight_dim_0,
          weight_dim_1,
          weight_scale,
          weight_offset,
          weight_ptr,
          bias_ptr,
          cache_key) {
  const int64_t m =
      std::get<static_cast<int>(QuantMatmulCacheKeyIndex::M)>(key);
  const int64_t input_zero_point =
      std::get<static_cast<int>(QuantMatmulCacheKeyIndex::INPUT_OFFSET)>(key);
  const double input_scale =
      std::get<static_cast<int>(QuantMatmulCacheKeyIndex::INPUT_SCALE)>(key);
  const int64_t output_zero_point =
      std::get<static_cast<int>(QuantMatmulCacheKeyIndex::OUTPUT_OFFSET)>(key);
  const double output_scale =
      std::get<static_cast<int>(QuantMatmulCacheKeyIndex::OUTPUT_SCALE)>(key);
  const bool signed_input =
      std::get<static_cast<int>(QuantMatmulCacheKeyIndex::SIGNED_INPUT)>(key);

  const auto input_acl_datatype = signed_input
      ? arm_compute::DataType::QASYMM8_SIGNED
      : arm_compute::DataType::QASYMM8;

  auto src_q_tensor_info = arm_compute::TensorInfo(
      arm_compute::TensorShape(weight_dim_0, m),
      1,
      input_acl_datatype,
      arm_compute::QuantizationInfo(input_scale, -input_zero_point, false));
  src_q_tensor_info.set_are_values_constant(false);
  src_q_tensor.allocator()->init(src_q_tensor_info);

  if (bias_ptr.has_value()) {
    auto bia_q_tensor_info = arm_compute::TensorInfo(
        arm_compute::TensorShape(1, weight_dim_1),
        1,
        arm_compute::DataType::S32,
        arm_compute::QuantizationInfo(
            1 / (input_scale * weight_scale), 0, false));
    bia_q_tensor_ = arm_compute::Tensor();
    bia_q_tensor_.value().allocator()->init(bia_q_tensor_info);

    float* bias_fp32_buffer =
        reinterpret_cast<float*>(bia_tensor_.value().buffer());
    // The int32 bias holds exactly weight_dim_1 entries (one per output
    // column), matching the ACL shape above and the loop below. It was
    // previously allocated as {m, weight_dim_0}, which is too small whenever
    // m * weight_dim_0 < weight_dim_1 and let the loop write out of bounds.
    bia_q_tensor_orig_ =
        at::empty({1, weight_dim_1}, at::device(c10::kCPU).dtype(c10::kQInt32));
    int32_t* bias_s32_buffer =
        static_cast<int32_t*>(bia_q_tensor_orig_.value().data_ptr());
    const float bias_scale =
        bia_q_tensor_info.quantization_info().uniform().scale;
    // Quantize the bias to int32_t. It makes sense to do it here rather in the
    // prepack phase because dynamically quantized ACL matmuls don't need the
    // bias in int32_t.
    at::parallel_for(0, weight_dim_1, 1, [&](int64_t start, int64_t end) {
      for (int64_t i = start; i < end; ++i) {
        bias_s32_buffer[i] =
            int32_t(std::round(bias_fp32_buffer[i] * bias_scale));
      }
    });
    bia_q_tensor_.value().allocator()->import_memory(bias_s32_buffer);
  }

  auto dst_q_tensor_info = arm_compute::TensorInfo(
      arm_compute::TensorShape(weight_dim_1, m),
      1,
      input_acl_datatype,
      arm_compute::QuantizationInfo(output_scale, output_zero_point, false));
  dst_q_tensor.allocator()->init(dst_q_tensor_info);

  // Setup lowp_gemm output stage.
  // Zero-initialised so that the stage never carries indeterminate values if
  // the helper below rejects the multiplier and leaves its outputs untouched.
  int output_multiplier = 0;
  int output_shift = 0;
  float multiplier = (input_scale * weight_scale) / output_scale;
  arm_compute::quantization::calculate_quantized_multiplier_less_than_one(
      multiplier, &output_multiplier, &output_shift);

  arm_compute::GEMMLowpOutputStageInfo output_stage_info;
  output_stage_info.type =
      arm_compute::GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
  output_stage_info.gemmlowp_multiplier = output_multiplier;
  output_stage_info.gemmlowp_shift = output_shift;
  output_stage_info.gemmlowp_offset = output_zero_point;

  // Default clamp bounds are the full range of the (un)signed 8-bit type.
  int32_t min_activation = signed_input ? std::numeric_limits<int8_t>::min()
                                        : std::numeric_limits<uint8_t>::min();
  int32_t max_activation = signed_input ? std::numeric_limits<int8_t>::max()
                                        : std::numeric_limits<uint8_t>::max();

  if (relu_info_.has_value()) {
    // figure out min, max values for ReLU
    const arm_compute::UniformQuantizationInfo uqinfo =
        dst_q_tensor_info.quantization_info().uniform();
    std::tie(min_activation, max_activation) =
        arm_compute::get_quantized_activation_min_max(
            relu_info_.value(), src_q_tensor_info.data_type(), uqinfo);
    // fuse ReLU with the GEMM
    gemm_info_.set_activation_info(relu_info_.value());
  }
  output_stage_info.gemmlowp_min_bound = min_activation;
  output_stage_info.gemmlowp_max_bound = max_activation;
  output_stage_info.output_data_type = dst_q_tensor_info.data_type();

  gemm_info_.set_gemmlowp_output_stage(output_stage_info);
}
// Detaches the imported int32 bias buffer (if any) from ACL.
StaticQuantMatmul::~StaticQuantMatmul() {
  if (!bia_q_tensor_.has_value()) {
    return;
  }
  // free() on imported memory does not release it; it only tells ACL that
  // the pointer is no longer in use.
  bia_q_tensor_.value().allocator()->free();
}
// Asks ACL whether the low-precision GEMM supports the tensors and GEMM
// settings described by this object and returns ACL's status.
arm_compute::Status StaticQuantMatmul::validate() {
  // The bias is optional; ACL expects a null info pointer when it is absent.
  auto* bias_info =
      bia_q_tensor_.has_value() ? bia_q_tensor_.value().info() : nullptr;
  return arm_compute::NEGEMMLowpMatrixMultiplyCore::validate(
      src_q_tensor.info(),
      wei_q_tensor_.info(),
      bias_info,
      dst_q_tensor.info(),
      gemm_info_);
}
// Configures the ACL low-precision GEMM with the source, weight, optional
// bias and destination tensors held by this object.
void StaticQuantMatmul::configure() {
  // The bias is optional; ACL expects a null tensor pointer when it is absent.
  auto* bias = bia_q_tensor_.has_value() ? &bia_q_tensor_.value() : nullptr;
  gemm.configure(
      &src_q_tensor, &wei_q_tensor_, bias, &dst_q_tensor, gemm_info_);
}
// Describes the two operands and the result of a quantized elementwise add
// to ACL. All three tensors share `dtype` and the extents in `input_dims`;
// each gets its own (scale, offset) quantization info. No memory is
// allocated or imported here.
QuantAdd::QuantAdd(
    arm_compute::DataType dtype,
    const std::vector<int64_t>& input_dims,
    double qa_scale,
    int64_t qa_offset,
    double qb_scale,
    int64_t qb_offset,
    double dst_scale,
    int64_t dst_offset) {
  // Narrow the ATen-side double/int64 parameters to ACL's float/int32.
  const auto make_qinfo = [](double scale, int64_t offset) {
    return arm_compute::QuantizationInfo{
        static_cast<float>(scale), static_cast<int32_t>(offset), false};
  };

  // The three tensors have identical extents, so one shape is built and
  // copied into each TensorInfo. Dimensions are set from last to first.
  arm_compute::TensorShape common_shape;
  for (size_t dim = input_dims.size(); dim-- > 0;) {
    common_shape.set(dim, input_dims[dim], false, true);
  }

  arm_compute::TensorInfo lhs_info(
      common_shape, 1, dtype, make_qinfo(qa_scale, qa_offset));
  arm_compute::TensorInfo rhs_info(
      common_shape, 1, dtype, make_qinfo(qb_scale, qb_offset));
  arm_compute::TensorInfo out_info(
      common_shape, 1, dtype, make_qinfo(dst_scale, dst_offset));

  qa_tensor.allocator()->init(lhs_info);
  qb_tensor.allocator()->init(rhs_info);
  qdst_tensor.allocator()->init(out_info);
}
// Asks ACL whether the arithmetic addition supports the three tensor infos
// under the configured conversion policy.
arm_compute::Status QuantAdd::validate() {
  const auto* lhs_info = qa_tensor.info();
  const auto* rhs_info = qb_tensor.info();
  const auto* out_info = qdst_tensor.info();
  return q_add.validate(lhs_info, rhs_info, out_info, policy);
}
void QuantAdd::configure() {
q_add.configure(&qa_tensor, &qb_tensor, &qdst_tensor, policy);
}
} // namespace at::native::acl_utils
// Forwards all arguments to the oneDNN base class, then records the two
// weight dimensions and the per-tensor quantization parameters of the
// original weight for later use by the ACL matmul objects.
PackedLinearWeightsACL::PackedLinearWeightsACL(
    std::unique_ptr<ideep::tensor> weight,
    std::optional<ideep::tensor> bias,
    at::Tensor orig_weight,
    std::optional<at::Tensor> orig_bias)
    : PackedLinearWeightsOnednn(
          std::move(weight),
          std::move(bias),
          std::move(orig_weight),
          std::move(orig_bias)) {
  // The constructor arguments have been moved from; read the base-class
  // members instead.
  auto packed_weight = *weight_;
  k_ = packed_weight.get_dim(0);
  n_ = packed_weight.get_dim(1);

  weight_scale_ = orig_weight_.q_scale();
  weight_zero_point_ = orig_weight_.q_zero_point();
}
#endif // AT_MKLDNN_ACL_ENABLED()

View File

@ -1,257 +0,0 @@
#pragma once
#include <ATen/Config.h>
#if AT_MKLDNN_ACL_ENABLED()
#include <ATen/native/quantized/cpu/OnednnUtils.h>
#include <arm_compute/core/Error.h>
#include <arm_compute/core/TensorInfo.h>
#include <arm_compute/function_info/ActivationLayerInfo.h>
#include <arm_compute/runtime/NEON/functions/NEActivationLayer.h>
#include <arm_compute/runtime/NEON/functions/NEArithmeticAddition.h>
#include <arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h>
#include <arm_compute/runtime/NEON/functions/NEQuantizationLayer.h>
#include <arm_compute/runtime/Tensor.h>
#include <array>
// Utilities for Arm Compute Library (ACL) quantized operations.
// Provides interfaces to leverage ACL's accelerated kernels for statically and
// dynamically quantized matmuls (i.e. qlinear and qlinear_dynamic). These are
// utilized through PackedLinearWeightsACL, which extends
// PackedLinearWeightsOnednn. Note that PackedLinearWeightsACL extends rather
// than replaces PackedLinearWeightsOnednn for AArch64 because ACL currently
// only supports per_tensor weight quantization.
namespace at::native::acl_utils {
// Key identifying one configured ACL quantized matmul in the per-layer cache.
// Two keys compare equal only if every field below matches.
using QuantMatmulCacheKey = std::tuple<
int64_t, // M
bool, // FUSE_RELU
int64_t, // NUM_THREADS
double, // INPUT_SCALE
int64_t, // INPUT_OFFSET
double, // OUTPUT_SCALE
int64_t, // OUTPUT_OFFSET
bool // SIGNED_INPUT
>;
// Names for the positions of the fields in QuantMatmulCacheKey. The
// enumerators are listed in the same order as the tuple's elements.
enum class QuantMatmulCacheKeyIndex {
M,
FUSE_RELU,
NUM_THREADS,
INPUT_SCALE,
INPUT_OFFSET,
OUTPUT_SCALE,
OUTPUT_OFFSET,
SIGNED_INPUT
};
// Abstract interface to share common stuff between static/dynamic ACL matmuls.
// Derived classes supply validate() and configure(); callers run `gemm`
// directly once configuration has succeeded.
struct QuantMatmul {
// The ACL low-precision GEMM function that performs the matmul.
arm_compute::NEGEMMLowpMatrixMultiplyCore gemm;
// key for use in the cache
QuantMatmulCacheKey key;
// weight_dim_0 / weight_dim_1: the two dimensions of the packed weight.
// weight_scale / weight_offset: per-tensor quantization params of the weight.
// weight_ptr: pointer to the int8 weight data.
// bias_ptr: optional pointer to float bias data.
// cache_key: stored as `key` -- NOTE(review): presumably also used to derive
// the tensor shapes and ReLU setting; confirm in the .cpp.
QuantMatmul(
int64_t weight_dim_0,
int64_t weight_dim_1,
double weight_scale,
int64_t weight_offset,
int8_t* weight_ptr,
std::optional<float*> bias_ptr,
const QuantMatmulCacheKey& cache_key);
virtual ~QuantMatmul();
// Returns ACL's verdict on whether this configuration is supported.
virtual arm_compute::Status validate() = 0;
// Configures `gemm` (and any auxiliary ACL functions) for execution.
virtual void configure() = 0;
protected:
// Quantized weight tensor handed to the GEMM.
arm_compute::Tensor wei_q_tensor_;
// Bias tensor; empty when the layer has no bias.
std::optional<arm_compute::Tensor> bia_tensor_;
// GEMM settings passed to validate()/configure().
arm_compute::GEMMInfo gemm_info_;
// Activation description; has a value only when ReLU is requested.
std::optional<arm_compute::ActivationLayerInfo> relu_info_;
};
// ACL matmul for dynamically quantized linear: an fp32 source is quantized at
// run time with `quant`, multiplied with the int8 weights by `gemm`, and the
// result is written to an fp32 destination.
struct DynamicQuantMatmul : public QuantMatmul {
// Quantized copy of the source; output of `quant` and input of `gemm`.
arm_compute::Tensor src_q_tensor;
// fp32 source; callers import the input buffer into it before running.
arm_compute::Tensor src_tensor;
// fp32 destination; callers import the output buffer into it before running.
arm_compute::Tensor dst_tensor;
// Quantization layer run on the source before the GEMM.
arm_compute::NEQuantizationLayer quant;
// We need a ReLU layer here (unlike static quantization) because the ReLU
// cannot be "truly" fused with the GEMM through gemm_info in ACL dynamically
// quantized matmuls.
std::optional<arm_compute::NEActivationLayer> relu;
// Parameters have the same meaning as in QuantMatmul's constructor.
DynamicQuantMatmul(
int64_t weight_dim_0,
int64_t weight_dim_1,
double weight_scale,
int64_t weight_offset,
int8_t* weight_ptr,
std::optional<float*> bias_ptr,
const QuantMatmulCacheKey& cache_key);
~DynamicQuantMatmul() override;
arm_compute::Status validate() override;
void configure() override;
private:
// NOTE(review): presumably the at::Tensor that owns the storage backing
// src_q_tensor -- confirm in the .cpp.
at::Tensor src_q_tensor_orig_;
};
// ACL matmul for statically quantized linear: both the source and the
// destination are quantized tensors whose buffers are imported by the caller
// around each run.
struct StaticQuantMatmul : public QuantMatmul {
// Quantized source; callers import the input buffer into it before running.
arm_compute::Tensor src_q_tensor;
// Quantized destination; callers import the output buffer before running.
arm_compute::Tensor dst_q_tensor;
// Parameters have the same meaning as in QuantMatmul's constructor.
StaticQuantMatmul(
int64_t weight_dim_0,
int64_t weight_dim_1,
double weight_scale,
int64_t weight_offset,
int8_t* weight_ptr,
std::optional<float*> bias_ptr,
const QuantMatmulCacheKey& cache_key);
// Detaches the quantized bias tensor from its allocator when one exists.
~StaticQuantMatmul() override;
arm_compute::Status validate() override;
void configure() override;
private:
// Quantized bias passed to the GEMM; empty when the layer has no bias.
std::optional<arm_compute::Tensor> bia_q_tensor_;
// NOTE(review): presumably the at::Tensor that owns the storage backing
// bia_q_tensor_ -- confirm in the .cpp.
std::optional<at::Tensor> bia_q_tensor_orig_;
};
// Bundles the ACL tensors and function needed for one quantized element-wise
// addition. Callers import buffers into the three tensors, run `q_add`, and
// then release the imported buffers.
struct QuantAdd {
// First operand.
arm_compute::Tensor qa_tensor;
// Second operand.
arm_compute::Tensor qb_tensor;
// Destination.
arm_compute::Tensor qdst_tensor;
// The ACL addition function.
arm_compute::NEArithmeticAddition q_add;
// dtype: ACL data type used for all three tensors.
// input_dims: extents used for all three tensors.
// q*_scale / q*_offset: quantization parameters of each tensor.
QuantAdd(
arm_compute::DataType dtype,
const std::vector<int64_t>& input_dims,
double qa_scale,
int64_t qa_offset,
double qb_scale,
int64_t qb_offset,
double dst_scale,
int64_t dst_offset);
// Returns ACL's verdict on whether this addition is supported.
arm_compute::Status validate();
// Configures `q_add` with the three tensors and `policy`.
void configure();
private:
// Conversion policy handed to ACL; fixed to SATURATE.
arm_compute::ConvertPolicy policy{arm_compute::ConvertPolicy::SATURATE};
};
} // namespace at::native::acl_utils
// Packed linear weights that run through Arm Compute Library when possible.
// Extends PackedLinearWeightsOnednn and overrides the apply* entry points;
// configured ACL matmul objects are kept in a small per-layer cache.
struct PackedLinearWeightsACL : public PackedLinearWeightsOnednn {
using ACLQuantMatmul = at::native::acl_utils::QuantMatmul;
using ACLDynamicQuantMatmul = at::native::acl_utils::DynamicQuantMatmul;
using ACLStaticQuantMatmul = at::native::acl_utils::StaticQuantMatmul;
using ACLQuantMatmulCacheKey = at::native::acl_utils::QuantMatmulCacheKey;
using ACLQuantMatmulCacheKeyIndex =
at::native::acl_utils::QuantMatmulCacheKeyIndex;
PackedLinearWeightsACL(
std::unique_ptr<ideep::tensor> weight,
std::optional<ideep::tensor> bias,
at::Tensor orig_weight,
std::optional<at::Tensor> orig_bias);
at::Tensor apply_dynamic(at::Tensor input, bool reduce_range = false)
override;
at::Tensor apply_dynamic_relu(at::Tensor input, bool reduce_range = false)
override;
at::Tensor apply(
at::Tensor input,
double output_scale,
int64_t output_zero_point) override;
at::Tensor apply_relu(
at::Tensor input,
double output_scale,
int64_t output_zero_point) override;
// Returns the cached (or newly created) matmul for `key`, downcast to
// ACLQuantMatmulT. The result is null when creation failed validation, or
// when the cached entry for `key` is of a different concrete type.
template <typename ACLQuantMatmulT>
std::shared_ptr<ACLQuantMatmulT> get_acl_quant_matmul(
const ACLQuantMatmulCacheKey& key) {
return std::dynamic_pointer_cast<ACLQuantMatmulT>(
fetch_or_create_acl_quant_matmul<ACLQuantMatmulT>(key));
}
private:
// First and second dimension of the packed weight.
int64_t k_;
int64_t n_;
// Per-tensor quantization parameters of the original weight.
int64_t weight_zero_point_;
double weight_scale_;
// A 2 element (per layer) cache. Given it's not intended to store more than 2
// elements, we do not need a fancy implementation. The idea behind it is to
// allow for a (configuration free) fast path for autoregressive
// transformer-like models which usually involve 2 input tensor shapes; one
// for the prefill phase and another for the autoregressive phase
std::array<std::shared_ptr<ACLQuantMatmul>, 2> cache_;
// Looks `key` up in the cache; on a miss, creates a new matmul and evicts
// the least recently used entry. Index 0 always holds the most recently
// used entry on return.
template <typename ACLQuantMatmulT>
std::shared_ptr<ACLQuantMatmul> fetch_or_create_acl_quant_matmul(
const ACLQuantMatmulCacheKey& key) {
// We're only maintaining a 2 element LRU cache
// hit first
if (cache_[0] != nullptr && cache_[0]->key == key) {
return cache_[0];
}
// hit second
if (cache_[1] != nullptr && cache_[1]->key == key) {
// Update LRU
std::swap(cache_[0], cache_[1]);
return cache_[0];
}
// miss -> replace Least Recently Used - i.e. element at index 1
// Note: create_acl_quant_matmul may return nullptr (validation failure);
// that nullptr then occupies index 0 and is what gets returned.
cache_[1] = create_acl_quant_matmul<ACLQuantMatmulT>(key);
std::swap(cache_[0], cache_[1]);
return cache_[0];
}
// Builds, validates and configures a new matmul of type ACLQuantMatmulT from
// this layer's weight/bias and `key`. Emits a warning and returns nullptr
// when ACL validation does not report OK.
template <typename ACLQuantMatmulT>
std::shared_ptr<ACLQuantMatmulT> create_acl_quant_matmul(
const ACLQuantMatmulCacheKey& key) {
// Left empty when the layer has no bias.
std::optional<float*> bias_ptr;
if (bias_.has_value()) {
bias_ptr = (float*)bias_.value().get_data_handle();
}
auto acl_gemm = std::make_shared<ACLQuantMatmulT>(
k_,
n_,
weight_scale_,
weight_zero_point_,
(int8_t*)weight_.get()->get_data_handle(),
bias_ptr,
key);
// validate
auto status = acl_gemm->validate();
if (status.error_code() != arm_compute::ErrorCode::OK) {
TORCH_WARN(
"Arm Compute Library's Quantized Matmul Validation Failed: " +
status.error_description());
return nullptr;
}
// configure
acl_gemm->configure();
return acl_gemm;
}
// Shared implementation of apply_dynamic / apply_dynamic_relu.
template <bool ReluFused>
at::Tensor apply_dynamic_impl(at::Tensor input, bool reduce_range = false);
// Shared implementation of apply / apply_relu.
template <bool ReluFused>
at::Tensor apply_impl(
at::Tensor input,
double output_scale,
int64_t output_zero_point);
};
#endif // AT_MKLDNN_ACL_ENABLED()

View File

@ -5,7 +5,6 @@
#include <ATen/ExpandUtils.h>
#include <torch/library.h>
#include <ATen/quantized/Quantizer.h>
#include <ATen/native/quantized/cpu/ACLUtils.h>
#include <ATen/native/quantized/cpu/BinaryOps.h>
#include <ATen/native/quantized/cpu/QuantizedOps.h>
#include <ATen/native/quantized/cpu/init_qnnpack.h>
@ -385,67 +384,6 @@ Tensor xnnp_add(Tensor qa, Tensor qb, double scale, int64_t zero_point) {
}
#endif // USE_XNNPACK
#if AT_MKLDNN_ACL_ENABLED()
// Quantized element-wise add of `qa` and `qb` executed with Arm Compute
// Library. The result is a new quantized tensor with the requested `scale`
// and `zero_point`, using qa's scalar type and suggested memory format.
// Raises (via TORCH_CHECK) when qa is not per-tensor quantized or when ACL
// rejects the configuration.
Tensor acl_qadd(Tensor qa, Tensor qb, double scale, int64_t zero_point) {
  TORCH_CHECK(
      qa.qscheme() == kPerTensorAffine || qa.qscheme() == kPerTensorSymmetric,
      "Only per tensor quantization is supported in ACL quantized add.");

  // Both operands are made contiguous in qa's suggested memory format so that
  // they share one layout.
  Tensor qa_contig = qa.contiguous(qa.suggest_memory_format());
  Tensor qb_contig = qb.contiguous(qa.suggest_memory_format());
  auto qa_mem_format = qa_contig.suggest_memory_format();
  Tensor dst = at::native::empty_affine_quantized(
      at::infer_size_dimvector(qa_contig.sizes(), qb_contig.sizes()),
      qa_contig.scalar_type(),
      std::nullopt /* layout */,
      kCPU,
      std::nullopt /* pin_memory */,
      scale,
      zero_point,
      qa_mem_format);

  // Nothing to compute for an empty leading dimension.
  if (qb_contig.size(0) == 0) {
    return dst;
  }

  auto input_dims = qa_contig.sizes().vec();
  auto acl_dtype = dst.scalar_type() == kQInt8
      ? arm_compute::DataType::QASYMM8_SIGNED
      : arm_compute::DataType::QASYMM8;
  auto acl_add = std::make_shared<acl_utils::QuantAdd>(
      acl_dtype,
      input_dims,
      qa_contig.q_scale(),
      qa_contig.q_zero_point(),
      qb_contig.q_scale(),
      qb_contig.q_zero_point(),
      dst.q_scale(),
      dst.q_zero_point());

  // The message names the add operation (it previously said "Matmul", which
  // pointed at the wrong kernel when diagnosing failures).
  auto status = acl_add->validate();
  TORCH_CHECK(
      status.error_code() == arm_compute::ErrorCode::OK,
      "Arm Compute Library's Quantized Add Validation Failed: " +
          status.error_description());
  acl_add->configure();

  // Run directly on the PyTorch buffers; no copies are made.
  acl_add->qa_tensor.allocator()->import_memory(qa_contig.data_ptr());
  acl_add->qb_tensor.allocator()->import_memory(qb_contig.data_ptr());
  acl_add->qdst_tensor.allocator()->import_memory(dst.data_ptr());
  acl_add->q_add.run();

  // this will not free memory, it will just tell ACL that we're no longer
  // using the pointer
  acl_add->qa_tensor.allocator()->free();
  acl_add->qb_tensor.allocator()->free();
  acl_add->qdst_tensor.allocator()->free();
  return dst;
}
#endif // AT_MKLDNN_ACL_ENABLED()
template <bool ReLUFused = false>
Tensor qadd(Tensor qa, Tensor qb, double scale, int64_t zero_point) {
check_inputs(qa, qb);
@ -468,15 +406,6 @@ Tensor qadd(Tensor qa, Tensor qb, double scale, int64_t zero_point) {
}
#endif // USE_PYTORCH_QNNPACK
}
#if AT_MKLDNN_ACL_ENABLED()
if (!ReLUFused && qa.ndimension() > 0 && qa.sizes() == qb.sizes() &&
qa.scalar_type() == qb.scalar_type() &&
(qa.scalar_type() == kQInt8 || qa.scalar_type() == kQUInt8)) {
return acl_qadd(qa, qb, scale, zero_point);
}
#endif // AT_MKLDNN_ACL_ENABLED()
auto qc = at::_empty_affine_quantized(
qa.sizes(),
at::device(kCPU)

View File

@ -1,18 +1,17 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/core/Tensor.h>
#include <ATen/Context.h>
#include <ATen/Parallel.h>
#include <ATen/TensorOperators.h>
#include <ATen/core/Tensor.h>
#include <ATen/native/mkldnn/MKLDNNCommon.h>
#include <ATen/native/quantized/PackedParams.h>
#include <ATen/native/quantized/cpu/ACLUtils.h>
#include <ATen/native/quantized/cpu/OnednnUtils.h>
#include <ATen/native/quantized/cpu/QnnpackUtils.h>
#include <ATen/native/quantized/cpu/QuantUtils.h>
#include <ATen/native/quantized/cpu/XnnpackUtils.h>
#include <ATen/native/quantized/cpu/fbgemm_utils.h>
#include <ATen/native/quantized/cpu/QnnpackUtils.h>
#include <ATen/native/quantized/cpu/XnnpackUtils.h>
#include <ATen/native/quantized/cpu/OnednnUtils.h>
#include <ATen/native/quantized/cpu/QuantUtils.h>
#include <ATen/native/quantized/cpu/qlinear.h>
#include <ATen/native/quantized/library.h>
#include <ATen/native/quantized/PackedParams.h>
#include <ATen/native/mkldnn/MKLDNNCommon.h>
#include <caffe2/utils/threadpool/pthreadpool-cpp.h>
#include <torch/library.h>
@ -1108,96 +1107,6 @@ static at::Tensor linear_int8_with_onednn_weight(
primitive.execute(ideep::stream::default_stream(), args);
return dim == 2 ? output : output.reshape(output_size);
}
#if AT_MKLDNN_ACL_ENABLED()
// Statically quantized linear on the ACL path, optionally with ReLU.
//
// input: quantized (QUInt8 or QInt8) tensor with at least one dimension.
// output_scale / output_zero_point: quantization parameters of the result.
//
// Returns a quantized tensor with the input's sizes, except that the last
// dimension is replaced by n_. Falls back to the oneDNN implementation when
// no ACL matmul could be obtained for this configuration.
template <bool ReluFused>
at::Tensor PackedLinearWeightsACL::apply_impl(
    at::Tensor input,
    double output_scale,
    int64_t output_zero_point) {
  const int64_t dim = input.dim();
  TORCH_CHECK(
      dim != 0, "qlinear (ACL): input dim should be at least 1, but got 0");
  TORCH_CHECK(
      input.scalar_type() == c10::ScalarType::QUInt8 ||
          input.scalar_type() == c10::ScalarType::QInt8,
      "qlinear (ACL): data type of input should be QUInt8 or QInt8.");

  auto input_contig = input.expect_contiguous();

  int64_t m = input.numel() / k_;
  double input_scale = input.q_scale();
  int64_t input_zero_point = input.q_zero_point();
  auto is_input_qint8 = input.scalar_type() == c10::ScalarType::QInt8;

  // Every value that influences the configured ACL objects is part of the key.
  auto key = std::make_tuple(
      m,
      ReluFused,
      static_cast<int64_t>(at::get_num_threads()),
      input_scale,
      input_zero_point,
      output_scale,
      output_zero_point,
      is_input_qint8);
  auto acl_gemm =
      get_acl_quant_matmul<at::native::acl_utils::StaticQuantMatmul>(key);

  if (acl_gemm) {
    // Shape the caller expects: input sizes with the last dimension set to n_.
    auto out_sizes = input.sizes().vec();
    out_sizes.back() = n_;

    auto dst_dims = {m, n_};
    at::Tensor output = at::_empty_affine_quantized(
        dst_dims,
        at::device(c10::kCPU).dtype(
            is_input_qint8 ? c10::kQInt8 : c10::kQUInt8),
        output_scale,
        output_zero_point);

    // Only touch ACL when there is something to compute. Checking before the
    // imports means an empty result never leaves an imported pointer attached
    // to the cached matmul, and the empty result still goes through the
    // reshape below like any other result.
    if (output.numel() != 0) {
      acl_gemm->src_q_tensor.allocator()->import_memory(
          input_contig->data_ptr());
      acl_gemm->dst_q_tensor.allocator()->import_memory(output.data_ptr());
      acl_gemm->gemm.run();
      // this will not free memory, it will just tell ACL that we're no longer
      // using the pointer
      acl_gemm->src_q_tensor.allocator()->free();
      acl_gemm->dst_q_tensor.allocator()->free();
    }

    if (output.sizes().vec() == out_sizes) {
      return output;
    }
    return output.reshape(out_sizes);
  }

  // fallback to oneDNN in the unlikely scenario that ACL's validation fails
  if constexpr (ReluFused) {
    return PackedLinearWeightsOnednn::apply_relu(
        input, output_scale, output_zero_point);
  } else {
    return PackedLinearWeightsOnednn::apply(
        input, output_scale, output_zero_point);
  }
}
// Quantized linear without ReLU; forwards to apply_impl with ReluFused=false.
at::Tensor PackedLinearWeightsACL::apply(
    at::Tensor input,
    double output_scale,
    int64_t output_zero_point) {
  constexpr bool kFuseRelu = false;
  return apply_impl<kFuseRelu>(
      std::move(input), output_scale, output_zero_point);
}
// Quantized linear with ReLU; forwards to apply_impl with ReluFused=true.
at::Tensor PackedLinearWeightsACL::apply_relu(
    at::Tensor input,
    double output_scale,
    int64_t output_zero_point) {
  constexpr bool kFuseRelu = true;
  return apply_impl<kFuseRelu>(
      std::move(input), output_scale, output_zero_point);
}
#endif // AT_MKLDNN_ACL_ENABLED()
#endif // #if AT_MKLDNN_ENABLED()
namespace at::native {

View File

@ -5,7 +5,6 @@
#include <ATen/native/quantized/cpu/fbgemm_utils.h>
#include <ATen/native/quantized/cpu/QnnpackUtils.h>
#include <ATen/native/quantized/cpu/OnednnUtils.h>
#include <ATen/native/quantized/cpu/ACLUtils.h>
#include <ATen/native/quantized/cpu/QuantUtils.h>
#include <ATen/native/quantized/library.h>
#include <ATen/native/quantized/PackedParams.h>
@ -698,135 +697,6 @@ static at::Tensor linear_dynamic_fp16_with_onednn_weight(
primitive.execute(ideep::stream::default_stream(), args);
return dim == 2 ? output : output.reshape(output_size);
}
#if AT_MKLDNN_ACL_ENABLED()
// Dynamically quantized linear on the ACL path, optionally with ReLU.
//
// input: float tensor with at least two dimensions.
// reduce_range: forwarded to the quantization-parameter selection.
//
// Returns a float tensor with the input's sizes, except that the last
// dimension is replaced by n_. Falls back to the oneDNN implementation when
// no ACL matmul could be obtained for this configuration.
template <bool ReluFused>
at::Tensor PackedLinearWeightsACL::apply_dynamic_impl(
    at::Tensor input,
    bool reduce_range) {
  // Dynamic: fp32 * int8 -> fp32
  using at::Tensor;

  TORCH_CHECK(
      input.dim() >= 2,
      "The dimension of input tensor should be larger than or equal to 2");
  TORCH_CHECK(
      input.scalar_type() == c10::ScalarType::Float,
      "qlinear_dynamic (ACL): data type of input should be float.");

  auto input_contig = input.contiguous();
  const int64_t dim = input.dim();
  auto input_reshaped =
      dim == 2 ? input : input.reshape({-1, input.size(input.dim() - 1)});
  auto input_dims = input_reshaped.sizes().vec();

  int64_t m = input_dims[0];

  // The quantization parameters are chosen per call, so the scale/offset
  // slots of the key are fixed placeholders.
  auto key = std::make_tuple(
      m, /* M */
      ReluFused, /* FUSE_RELU */
      static_cast<int64_t>(at::get_num_threads()), /* NUM_THREADS */
      1, /* INPUT_SCALE */
      0, /* INPUT_OFFSET */
      1, /* OUTPUT_SCALE */
      0, /* OUTPUT_OFFSET */
      true /* SIGNED_INPUT */
  );
  auto acl_gemm =
      get_acl_quant_matmul<at::native::acl_utils::DynamicQuantMatmul>(key);

  if (acl_gemm) {
    // Shape the caller expects: input sizes with the last dimension set to n_.
    auto out_sizes = input.sizes().vec();
    out_sizes.back() = n_;

    // allocation for fp32 out tensor
    auto output = at::empty({m, n_}, input.options().dtype(at::kFloat));

    // Only touch ACL when there is something to compute. Checking before any
    // import or run means an empty result never leaves an imported pointer
    // attached to the cached matmul, and the empty result still goes through
    // the reshape below like any other result.
    if (output.numel() != 0) {
      // Find quantization parameters
      float x_max = 0, x_min = 0;

#ifdef USE_FBGEMM
      // Use FBGEMM's FindMinMax if available since it's faster
      fbgemm::FindMinMax(
          /*m=*/input_contig.data_ptr<float>(),
          /*min=*/&x_min,
          /*max=*/&x_max,
          /*len=*/input.numel());
#else
      if (input_contig.numel() > 0) {
        auto [t_min, t_max] = at::aminmax(input_contig);
        x_max = t_max.item<float>();
        x_min = t_min.item<float>();
      }
#endif

      auto q_params = quant_utils::ChooseQuantizationParams(
          /*min=*/x_min,
          /*max=*/x_max,
          /*qmin=*/std::numeric_limits<int8_t>::min(),
          /*qmax=*/std::numeric_limits<int8_t>::max(),
          /*preserve_sparsity=*/false,
          /*force_scale_power_of_two=*/false,
          /*reduce_range=*/reduce_range);

      acl_gemm->src_tensor.allocator()->import_memory(
          (float*)input_contig.data_ptr());

      acl_gemm->src_q_tensor.info()->set_quantization_info(
          arm_compute::QuantizationInfo(
              q_params.scale, q_params.zero_point, true));

      // quantize src tensor: fp32 -> s8
      acl_gemm->quant.run();

      // We set the offset to "-zero_point" for the GEMM, but to "zero_point"
      // for the quantization layer. This is a known inconsistency in ACL.
      acl_gemm->src_q_tensor.info()->set_quantization_info(
          arm_compute::QuantizationInfo(
              q_params.scale, -q_params.zero_point, true));

      acl_gemm->dst_tensor.allocator()->import_memory(
          (float*)output.data_ptr());

      // s8 src, s8 wei -> f32 dst
      acl_gemm->gemm.run();

      if (acl_gemm->relu.has_value()) {
        acl_gemm->relu->run();
      }

      // this will not free memory, it will just tell ACL that we're no longer
      // using the pointer
      acl_gemm->src_tensor.allocator()->free();
      acl_gemm->dst_tensor.allocator()->free();
    }

    if (output.sizes().vec() == out_sizes) {
      return output;
    }
    return output.reshape(out_sizes);
  }

  // fallback to oneDNN in the unlikely scenario that ACL's validation fails
  if constexpr (ReluFused) {
    return PackedLinearWeightsOnednn::apply_dynamic_relu(input, reduce_range);
  } else {
    return PackedLinearWeightsOnednn::apply_dynamic(input, reduce_range);
  }
}
// Dynamically quantized linear without ReLU; forwards to apply_dynamic_impl
// with ReluFused=false.
at::Tensor PackedLinearWeightsACL::apply_dynamic(
    at::Tensor input,
    bool reduce_range) {
  constexpr bool kFuseRelu = false;
  return apply_dynamic_impl<kFuseRelu>(std::move(input), reduce_range);
}
// Dynamically quantized linear with ReLU; forwards to apply_dynamic_impl with
// ReluFused=true.
at::Tensor PackedLinearWeightsACL::apply_dynamic_relu(
    at::Tensor input,
    bool reduce_range) {
  constexpr bool kFuseRelu = true;
  return apply_dynamic_impl<kFuseRelu>(std::move(input), reduce_range);
}
#endif // #if AT_MKLDNN_ACL_ENABLED()
#endif // #if AT_MKLDNN_ENABLED()
namespace at::native {

View File

@ -1,16 +1,15 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/Context.h>
#include <ATen/core/Tensor.h>
#include <ATen/cpp_custom_type_hack.h>
#include <ATen/native/mkldnn/MKLDNNCommon.h>
#include <ATen/native/quantized/PackedParams.h>
#include <ATen/native/quantized/cpu/ACLUtils.h>
#include <ATen/native/quantized/cpu/OnednnUtils.h>
#include <ATen/native/quantized/cpu/QnnpackUtils.h>
#include <ATen/native/quantized/cpu/QuantUtils.h>
#include <ATen/Context.h>
#include <ATen/native/quantized/cpu/fbgemm_utils.h>
#include <ATen/native/quantized/cpu/init_qnnpack.h>
#include <ATen/native/quantized/cpu/QnnpackUtils.h>
#include <ATen/native/quantized/cpu/OnednnUtils.h>
#include <ATen/native/quantized/cpu/QuantUtils.h>
#include <ATen/native/quantized/library.h>
#include <ATen/native/quantized/PackedParams.h>
#include <ATen/native/mkldnn/MKLDNNCommon.h>
#include <ATen/quantized/Quantizer.h>
#include <torch/custom_class.h>
#include <torch/library.h>
@ -280,15 +279,12 @@ c10::intrusive_ptr<LinearPackedParamsBase> PackedLinearWeightsOnednn::prepack(
packed_bias.init(bias_desc, b.data_ptr());
onednn_bias = std::optional<ideep::tensor>(packed_bias);
}
#if AT_MKLDNN_ACL_ENABLED()
if (qtype == c10::kPerTensorAffine) {
return c10::make_intrusive<PackedLinearWeightsACL>(PackedLinearWeightsACL{
std::move(weight_ptr), onednn_bias, weight, bias});
}
#endif // #if AT_MKLDNN_ACL_ENABLED()
auto ret_ptr =
c10::make_intrusive<PackedLinearWeightsOnednn>(PackedLinearWeightsOnednn{
std::move(weight_ptr), onednn_bias, weight, bias});
auto ret_ptr = c10::make_intrusive<PackedLinearWeightsOnednn>(
PackedLinearWeightsOnednn{
std::move(weight_ptr),
onednn_bias,
weight,
bias});
return ret_ptr;
}

View File

@ -759,28 +759,6 @@ Tensor scaled_dot_product_attention(
&& !(GradMode::is_enabled() && any_inputs_require_grad)
&& (all_contiguous || mps::is_macos_13_or_newer(mps::MacOSVersion::MACOS_VER_15_0_PLUS))
&& !any_nested) {
if (enable_gqa) {
int64_t q_heads = query_.size(-3);
int64_t k_heads = key.size(-3);
int64_t repeat_factor = q_heads / k_heads;
if (repeat_factor > 1) {
TORCH_CHECK(q_heads % k_heads == 0,
"For GQA, the query tensor's head dimension (" + std::to_string(q_heads) +
") must be divisible by the key tensor's head dimension (" + std::to_string(k_heads) + ").");
auto repeated_key = key.repeat_interleave(repeat_factor, /*dim=*/-3);
auto repeated_value = value.repeat_interleave(repeat_factor, /*dim=*/-3);
return std::get<0>(at::_scaled_dot_product_attention_math_for_mps(
query_,
repeated_key,
repeated_value,
attn_mask,
dropout_p,
is_causal,
std::nullopt, /*dropout_mask*/
scale));
}
}
return std::get<0>(at::_scaled_dot_product_attention_math_for_mps(
query_,
key,

View File

@ -1408,7 +1408,7 @@ class AOTInductorModelCache:
def load(cls, model, example_inputs):
import torch._inductor
import torch.export._trace
from torch.export.dynamic_shapes import _combine_args, _tree_map_with_path
from torch.export.dynamic_shapes import _tree_map_with_path
key = weakref.ref(model)
if key not in cls.cache:
@ -1428,7 +1428,7 @@ class AOTInductorModelCache:
else:
_register_dataclass_output_as_pytree(example_outputs)
combined_args = _combine_args(model, example_args, example_kwargs)
combined_args = tuple(example_args) + tuple(example_kwargs.values())
dynamic_shapes = _tree_map_with_path(
_produce_dynamic_shapes_for_export, combined_args
)
@ -1449,13 +1449,13 @@ class AOTInductorModelCache:
def export(model, example_inputs):
from torch.export.dynamic_shapes import _combine_args, _tree_map_with_path
from torch.export.dynamic_shapes import _tree_map_with_path
example_args, example_kwargs = _normalize_bench_inputs(example_inputs)
example_outputs = model(*example_args, **example_kwargs)
_register_dataclass_output_as_pytree(example_outputs)
combined_args = _combine_args(model, example_args, example_kwargs)
combined_args = tuple(example_args) + tuple(example_kwargs.values())
dynamic_shapes = _tree_map_with_path(
_produce_dynamic_shapes_for_export, combined_args
)

View File

@ -369,10 +369,7 @@ class HuggingfaceRunner(BenchmarkRunner):
return self._skip["control_flow"]
def use_larger_multiplier_for_smaller_tensor(self, name):
return name in [
"ElectraForQuestionAnswering",
"MegatronBertForQuestionAnswering",
]
return name in ["ElectraForQuestionAnswering"]
def _get_model_cls_and_config(self, model_name):
if model_name not in EXTRA_MODELS:

View File

@ -6,7 +6,7 @@ add_loop_eager_dynamic,compile_time_instruction_count,5460000000,0.025
add_loop_inductor,compile_time_instruction_count,27660000000,0.015
add_loop_inductor,compile_time_instruction_count,27520000000,0.015
@ -22,11 +22,11 @@ basic_modules_ListOfLinears_eager,compile_time_instruction_count,953800000,0.015
basic_modules_ListOfLinears_inductor,compile_time_instruction_count,17190000000,0.015
basic_modules_ListOfLinears_inductor,compile_time_instruction_count,17070000000,0.015
basic_modules_ListOfLinears_inductor_gpu_force_shape_pad,compile_time_instruction_count,15410000000,0.015
basic_modules_ListOfLinears_inductor_gpu_force_shape_pad,compile_time_instruction_count,15320000000,0.015
@ -42,24 +42,24 @@ sum_floordiv_regression,compile_time_instruction_count,1026000000,0.015
symint_sum,compile_time_instruction_count,3030000000,0.015
symint_sum,compile_time_instruction_count,3013000000,0.015
aotdispatcher_inference_nosubclass_cpu,compile_time_instruction_count,1989000000,0.015
aotdispatcher_inference_nosubclass_cpu,compile_time_instruction_count,1964000000,0.015
aotdispatcher_inference_subclass_cpu,compile_time_instruction_count,5759000000,0.015
aotdispatcher_inference_subclass_cpu,compile_time_instruction_count,5672000000,0.015
aotdispatcher_partitioner_cpu,compile_time_instruction_count,7873000000,0.015
aotdispatcher_partitioner_cpu,compile_time_instruction_count,7752000000,0.015
aotdispatcher_training_nosubclass_cpu,compile_time_instruction_count,3579000000,0.015
aotdispatcher_training_nosubclass_cpu,compile_time_instruction_count,3537000000,0.015
aotdispatcher_training_subclass_cpu,compile_time_instruction_count,9809000000,0.015
aotdispatcher_training_subclass_cpu,compile_time_instruction_count,9662000000,0.015

1 add_loop_eager compile_time_instruction_count 2806000000 0.015
6 basic_modules_ListOfLinears_eager compile_time_instruction_count 953800000 0.015
7 basic_modules_ListOfLinears_inductor compile_time_instruction_count 17190000000 17070000000 0.015
8 basic_modules_ListOfLinears_inductor_gpu_force_shape_pad compile_time_instruction_count 15410000000 15320000000 0.015
9 basic_modules_ListOfLinears_inductor_gpu compile_time_instruction_count 9714000000 0.2
10 update_hint_regression compile_time_instruction_count 1523000000 0.02
11 sum_floordiv_regression compile_time_instruction_count 1026000000 0.015
12 symint_sum compile_time_instruction_count 3030000000 3013000000 0.015
22
23
24
25
26
27
28
29
30
31
32
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65

View File

@ -127,7 +127,6 @@ REQUIRE_LARGER_MULTIPLIER_FOR_SMALLER_TENSOR = {
"inception_v3",
"mobilenetv3_large_100",
"cspdarknet53",
"gluon_inception_v3",
}

View File

@ -167,7 +167,7 @@ def get_inputs(
if op_name == "mm":
A = torch.randn(M, K, dtype=dtype, device=device)
B = torch.randn(N, K, dtype=dtype, device=device).t()
B = torch.randn(K, N, dtype=dtype, device=device)
C = None
return A, B, C
else:

View File

@ -296,7 +296,8 @@ class BenchmarkRunner:
(key.strip(), value.strip())
for key, value in map(lambda str: str.split(":"), key_vals) # noqa: C417
] # ['M: (32, 16)', 'ZPB: 2'] -> [('M', '(32, 16)'), ('ZPB', '2')]
out.update(key_vals)
for key, value in key_vals:
out[key] = value
return out

View File

@ -859,7 +859,6 @@ libtorch_python_core_sources = [
"torch/csrc/inductor/aoti_eager/kernel_holder.cpp",
"torch/csrc/inductor/aoti_eager/kernel_meta_info.cpp",
"torch/csrc/inductor/resize_storage_bytes.cpp",
"torch/csrc/inductor/static_cuda_launcher.cpp",
"torch/csrc/jit/backends/backend_init.cpp",
"torch/csrc/jit/python/init.cpp",
"torch/csrc/jit/passes/onnx.cpp",

View File

@ -50,7 +50,7 @@ endif()
)
if(NOT BUILD_LIBTORCHLESS)
add_library(c10 ${C10_SRCS} ${C10_HEADERS})
torch_compile_options(c10)
target_compile_options_if_supported(c10 "-Wdeprecated")
if(HAVE_SOVERSION)
set_target_properties(c10 PROPERTIES
VERSION ${TORCH_VERSION} SOVERSION ${TORCH_SOVERSION})

View File

@ -76,7 +76,7 @@ inline Backend dispatchKeyToBackend(DispatchKey t) {
return Backend::VE;
} else if (t == DispatchKey::FPGA) {
return Backend::FPGA;
} else if (t == DispatchKey::MAIA || t == DispatchKey::AutogradMAIA) {
} else if (t == DispatchKey::MAIA) {
return Backend::MAIA;
} else if (t == DispatchKey::XLA || t == DispatchKey::AutogradXLA) {
return Backend::XLA;

View File

@ -32,8 +32,6 @@ const char* toString(BackendComponent t) {
return "VEBit";
case BackendComponent::MTIABit:
return "MTIA";
case BackendComponent::MAIABit:
return "MAIA";
case BackendComponent::PrivateUse1Bit:
return "PrivateUse1Bit";
case BackendComponent::PrivateUse2Bit:
@ -144,8 +142,6 @@ const char* toString(DispatchKey t) {
return "AutocastCPU";
case DispatchKey::AutocastMTIA:
return "AutocastMTIA";
case DispatchKey::AutocastMAIA:
return "AutocastMAIA";
case DispatchKey::AutocastXPU:
return "AutocastXPU";
case DispatchKey::AutocastIPU:
@ -303,7 +299,6 @@ c10::DispatchKey parseDispatchKey(const std::string& k) {
{"Tracer", c10::DispatchKey::Tracer},
{"AutocastCPU", c10::DispatchKey::AutocastCPU},
{"AutocastMTIA", c10::DispatchKey::AutocastMTIA},
{"AutocastMAIA", c10::DispatchKey::AutocastMAIA},
{"AutocastXPU", c10::DispatchKey::AutocastXPU},
{"AutocastIPU", c10::DispatchKey::AutocastIPU},
{"AutocastHPU", c10::DispatchKey::AutocastHPU},

Some files were not shown because too many files have changed in this diff Show More