Mirror of https://github.com/pytorch/pytorch.git (synced 2025-11-19 10:04:58 +08:00)

Compare commits: 111 commits (ciflow/tru..., PR-NoneBug)
| SHA1 |
|---|
| 9e4229de28 |
| e6180cd8ed |
| d2e81d9c6f |
| 7e274747a9 |
| 52ca2d7075 |
| f9c9b2f290 |
| 9b8015ccf8 |
| beaa796c5d |
| 29191cc334 |
| 4166d499a3 |
| 03b371eaeb |
| b390a00516 |
| 1d8ced9d1d |
| 8a6174dc2b |
| 65cf6d012f |
| 3c15220ddb |
| 583a007aa5 |
| 25eeca7eb7 |
| d68453d068 |
| 8424f6173c |
| 82dcfaa887 |
| c2aecbe5e4 |
| d506df5d5e |
| 63fa95cf2f |
| c8d7ea2038 |
| 453041e58f |
| 8ba468e0d6 |
| 2174aab940 |
| 94fce5beda |
| 1a1ec9f15f |
| e60d44efd6 |
| 1797f1c9dc |
| ae3bb0645d |
| 3efe2f4e05 |
| 202e398db3 |
| 0893b5dbd4 |
| 011e9bd00d |
| 8bcd9e543c |
| 108a8311d5 |
| 7b6905618e |
| b25e2b459d |
| ab3bf915d2 |
| 7f11f58108 |
| efd6c418cc |
| 63ae5e9554 |
| 775752512f |
| eeb3f6bcc8 |
| 3a52108033 |
| d37fb92fb5 |
| c133661f5d |
| ec16fd258d |
| c4b4c1793b |
| 2a80480777 |
| c81e2466c3 |
| be281e74a9 |
| 17f8cec511 |
| a3e229bec1 |
| cfcd399c2e |
| def2f4ee78 |
| 51fbd5873e |
| 52efd5d4c0 |
| e7a6590abb |
| 9988121c55 |
| 53c9de34f5 |
| f78e347c40 |
| 316c9d4185 |
| 8cfbfceded |
| c8008102fd |
| 07525763d0 |
| 7f69d96c71 |
| 01ff36e5e2 |
| b4f3a4c5c5 |
| fd779b287c |
| 85814f9047 |
| 170c622400 |
| 9e901b34da |
| 442a63a9ba |
| 297258a623 |
| 21ce69be05 |
| 53f25d34fd |
| e969f3f2d7 |
| c9ff5375ff |
| 0a40ddd336 |
| 1880dcfa78 |
| 01e43190a3 |
| debc2170c0 |
| dcd776c0ad |
| 91ac179599 |
| 356bd932d8 |
| 8a04f3e0ff |
| ba828d0fc6 |
| fedbbd83dd |
| 828536de0f |
| 4155456b85 |
| 8f3f9044a0 |
| 56e3472fea |
| 73121c8521 |
| 03da94b981 |
| c7e063c42c |
| 39169e2e44 |
| d8e5d7ddea |
| 5a04072af5 |
| a3d27ce9f2 |
| 715092be96 |
| 99d88c7c46 |
| 55c1b7229a |
| 2772c6d4aa |
| 8cd8e3833c |
| e96e531ac3 |
| 40e2169c40 |
| 12f2d34f12 |
@@ -1 +1 @@
-cd1c833b079adb324871dcbbe75b43d42ffc0ade
+ca4783992ed7602a39528ba304d61f00396b2a5a
@@ -137,6 +137,39 @@ function install_124 {
    ldconfig
}

+function install_126 {
+    echo "Installing CUDA 12.6.2 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.6.2"
+    rm -rf /usr/local/cuda-12.6 /usr/local/cuda
+    # install CUDA 12.6.2 in the same container
+    wget -q https://developer.download.nvidia.com/compute/cuda/12.6.2/local_installers/cuda_12.6.2_560.35.03_linux.run
+    chmod +x cuda_12.6.2_560.35.03_linux.run
+    ./cuda_12.6.2_560.35.03_linux.run --toolkit --silent
+    rm -f cuda_12.6.2_560.35.03_linux.run
+    rm -f /usr/local/cuda && ln -s /usr/local/cuda-12.6 /usr/local/cuda
+
+    # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
+    mkdir tmp_cudnn && cd tmp_cudnn
+    wget -q https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz -O cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz
+    tar xf cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz
+    cp -a cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive/include/* /usr/local/cuda/include/
+    cp -a cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive/lib/* /usr/local/cuda/lib64/
+    cd ..
+    rm -rf tmp_cudnn
+
+    # NCCL license: https://docs.nvidia.com/deeplearning/nccl/#licenses
+    # Follow build: https://github.com/NVIDIA/nccl/tree/master?tab=readme-ov-file#build
+    git clone -b $NCCL_VERSION --depth 1 https://github.com/NVIDIA/nccl.git
+    cd nccl && make -j src.build
+    cp -a build/include/* /usr/local/cuda/include/
+    cp -a build/lib/* /usr/local/cuda/lib64/
+    cd ..
+    rm -rf nccl
+
+    install_cusparselt_062
+
+    ldconfig
+}
+
function prune_118 {
    echo "Pruning CUDA 11.8 and cuDNN"
    #####################################################################################
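Note: install_126 mirrors install_124 above. A quick hedged smoke check after it runs (not part of the diff) could be:

    /usr/local/cuda/bin/nvcc --version          # should report release 12.6
    ldconfig -p | grep -E 'libcudnn|libnccl'    # confirm cuDNN/NCCL are visible to the loader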
@@ -227,12 +260,46 @@ function prune_124 {
    $NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublasLt_static.a -o $CUDA_LIB_DIR/libcublasLt_static.a

    #####################################################################################
-    # CUDA 12.1 prune visual tools
+    # CUDA 12.4 prune visual tools
    #####################################################################################
    export CUDA_BASE="/usr/local/cuda-12.4/"
    rm -rf $CUDA_BASE/libnvvp $CUDA_BASE/nsightee_plugins $CUDA_BASE/nsight-compute-2024.1.0 $CUDA_BASE/nsight-systems-2023.4.4/
}

+function prune_126 {
+    echo "Pruning CUDA 12.6"
+    #####################################################################################
+    # CUDA 12.6 prune static libs
+    #####################################################################################
+    export NVPRUNE="/usr/local/cuda-12.6/bin/nvprune"
+    export CUDA_LIB_DIR="/usr/local/cuda-12.6/lib64"
+
+    export GENCODE="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"
+    export GENCODE_CUDNN="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"
+
+    if [[ -n "$OVERRIDE_GENCODE" ]]; then
+        export GENCODE=$OVERRIDE_GENCODE
+    fi
+    if [[ -n "$OVERRIDE_GENCODE_CUDNN" ]]; then
+        export GENCODE_CUDNN=$OVERRIDE_GENCODE_CUDNN
+    fi
+
+    # all CUDA libs except CuDNN and CuBLAS
+    ls $CUDA_LIB_DIR/ | grep "\.a" | grep -v "culibos" | grep -v "cudart" | grep -v "cudnn" | grep -v "cublas" | grep -v "metis" \
+        | xargs -I {} bash -c \
+            "echo {} && $NVPRUNE $GENCODE $CUDA_LIB_DIR/{} -o $CUDA_LIB_DIR/{}"
+
+    # prune CuDNN and CuBLAS
+    $NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublas_static.a -o $CUDA_LIB_DIR/libcublas_static.a
+    $NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublasLt_static.a -o $CUDA_LIB_DIR/libcublasLt_static.a
+
+    #####################################################################################
+    # CUDA 12.6 prune visual tools
+    #####################################################################################
+    export CUDA_BASE="/usr/local/cuda-12.6/"
+    rm -rf $CUDA_BASE/libnvvp $CUDA_BASE/nsightee_plugins $CUDA_BASE/nsight-compute-2024.3.2 $CUDA_BASE/nsight-systems-2024.5.1/
+}
+
# idiomatic parameter and option handling in sh
while test $# -gt 0
do
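Note: prune_126 honors the same OVERRIDE_GENCODE/OVERRIDE_GENCODE_CUDNN escape hatches as the earlier prune functions, so a build targeting a single architecture can shrink the static libs further. A hedged example invocation (the arch value is illustrative):

    # keep only sm_90 device code in the pruned static libraries
    OVERRIDE_GENCODE="-gencode arch=compute_90,code=sm_90" bash ./install_cuda.sh 12.6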
@@ -243,6 +310,8 @@ do
            ;;
        12.4) install_124; prune_124
            ;;
+        12.6) install_126; prune_126
+            ;;
        *) echo "bad argument $1"; exit 1
            ;;
    esac
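The Dockerfile hunk further down drives this new case arm; invoked directly it looks like:

    bash ./install_cuda.sh 12.6   # dispatches to install_126, then prune_126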
@@ -41,13 +41,16 @@ function install_ubuntu() {
        libegl-mesa0 libegl1-mesa libegl1-mesa-dev libgbm1 libgl1-mesa-dev libgl1-mesa-dri \
        libglapi-mesa libgles2-mesa-dev libglx-mesa0 libigdgmm12 libxatracker2 mesa-va-drivers \
        mesa-vdpau-drivers mesa-vulkan-drivers va-driver-all vainfo hwinfo clinfo
+    if [[ "${XPU_DRIVER_TYPE,,}" == "rolling" ]]; then
+        apt-get install -y intel-ocloc
+    fi
    # Development Packages
    apt-get install -y libigc-dev intel-igc-cm libigdfcl-dev libigfxcmrt-dev level-zero-dev
    # Install Intel Support Packages
    if [ -n "$XPU_VERSION" ]; then
-        apt-get install -y intel-for-pytorch-gpu-dev-${XPU_VERSION} intel-pti-dev
+        apt-get install -y intel-for-pytorch-gpu-dev-${XPU_VERSION} intel-pti-dev-0.9
    else
-        apt-get install -y intel-for-pytorch-gpu-dev intel-pti-dev
+        apt-get install -y intel-for-pytorch-gpu-dev-0.5 intel-pti-dev-0.9
    fi

    # Cleanup
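Note: the new guard relies on bash's `${XPU_DRIVER_TYPE,,}` expansion, which lowercases the value so the comparison is case-insensitive. A minimal sketch of the idiom:

    XPU_DRIVER_TYPE="Rolling"
    if [[ "${XPU_DRIVER_TYPE,,}" == "rolling" ]]; then   # ,, lowercases the expansion
        echo "rolling driver selected"
    fi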
@@ -97,7 +100,7 @@ EOF
        intel-igc-opencl-devel level-zero-devel intel-gsc-devel libmetee-devel \
        level-zero-devel
    # Install Intel Support Packages
-    yum install -y intel-for-pytorch-gpu-dev intel-pti-dev
+    yum install -y intel-for-pytorch-gpu-dev-0.5 intel-pti-dev-0.9

    # Cleanup
    dnf clean all
@@ -131,7 +134,7 @@ function install_sles() {
    zypper install -y libigdfcl-devel intel-igc-cm libigfxcmrt-devel level-zero-devel

    # Install Intel Support Packages
-    zypper install -y intel-for-pytorch-gpu-dev intel-pti-dev
+    zypper install -y intel-for-pytorch-gpu-dev-0.5 intel-pti-dev-0.9

}
@@ -70,6 +70,10 @@ FROM cuda as cuda12.4
RUN bash ./install_cuda.sh 12.4
ENV DESIRED_CUDA=12.4

+FROM cuda as cuda12.6
+RUN bash ./install_cuda.sh 12.6
+ENV DESIRED_CUDA=12.6
+
# Install MNIST test data
FROM base as mnist
ADD ./common/install_mnist.sh install_mnist.sh
@@ -79,6 +83,7 @@ FROM base as all_cuda
COPY --from=cuda11.8 /usr/local/cuda-11.8 /usr/local/cuda-11.8
COPY --from=cuda12.1 /usr/local/cuda-12.1 /usr/local/cuda-12.1
COPY --from=cuda12.4 /usr/local/cuda-12.4 /usr/local/cuda-12.4
+COPY --from=cuda12.6 /usr/local/cuda-12.6 /usr/local/cuda-12.6

# Final step
FROM ${BASE_TARGET} as final
@@ -1,10 +1,12 @@
# cf. https://github.com/pypa/manylinux/issues/53

+import sys
+from urllib.request import urlopen
+
+
GOOD_SSL = "https://google.com"
BAD_SSL = "https://self-signed.badssl.com"

-import sys
-
print("Testing SSL certificate checking for Python:", sys.version)
@@ -12,14 +14,8 @@ if sys.version_info[:2] < (2, 7) or sys.version_info[:2] < (3, 4):
    print("This version never checks SSL certs; skipping tests")
    sys.exit(0)

-if sys.version_info[0] >= 3:
-    from urllib.request import urlopen
-
-    EXC = OSError
-else:
-    from urllib import urlopen
-
-    EXC = IOError
+EXC = OSError

print(f"Connecting to {GOOD_SSL} should work")
urlopen(GOOD_SSL)
@@ -5,7 +5,7 @@
#Pinned versions: 1.6
#test that import:

-boto3==1.19.12
+boto3==1.35.42
#Description: AWS SDK for python
#Pinned versions: 1.19.12, 1.16.34
#test that import:
@@ -284,7 +284,7 @@ test_python_shard() {

  # modify LD_LIBRARY_PATH to ensure it has the conda env.
  # This set of tests has been shown to be buggy without it for the split-build
-  time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests $INCLUDE_CLAUSE --shard "$1" "$NUM_TEST_SHARDS" --verbose $PYTHON_TEST_EXTRA_OPTION
+  time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests $INCLUDE_CLAUSE --shard "$1" "$NUM_TEST_SHARDS" --verbose $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running

  assert_git_not_dirty
}
@@ -310,7 +310,8 @@ test_dynamo_shard() {
    --exclude-distributed-tests \
    --exclude-torch-export-tests \
    --shard "$1" "$NUM_TEST_SHARDS" \
-    --verbose
+    --verbose \
+    --upload-artifacts-while-running
  assert_git_not_dirty
}
@@ -1354,7 +1355,7 @@ test_executorch() {
  echo "Run ExecuTorch regression tests for some models"
  # TODO(huydhn): Add more coverage here using ExecuTorch's gather models script
  # shellcheck disable=SC1091
-  source .ci/scripts/test.sh mv3 cmake xnnpack-quantization-delegation ''
+  source .ci/scripts/test_model.sh mv3 cmake xnnpack-quantization-delegation ''

  popd
@ -114,6 +114,12 @@ if [[ "$PACKAGE_TYPE" =~ .*wheel.* && -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_B
|
||||
fi
|
||||
fi
|
||||
|
||||
USE_GLOO_WITH_OPENSSL="ON"
|
||||
if [[ "$GPU_ARCH_TYPE" =~ .*aarch64.* ]]; then
|
||||
USE_GLOO_WITH_OPENSSL="OFF"
|
||||
USE_GOLD_LINKER="OFF"
|
||||
fi
|
||||
|
||||
cat >"$envfile" <<EOL
|
||||
# =================== The following code will be executed inside Docker container ===================
|
||||
export TZ=UTC
|
||||
@@ -153,7 +159,7 @@ export DOCKER_IMAGE="$DOCKER_IMAGE"


export USE_GOLD_LINKER="${USE_GOLD_LINKER}"
-export USE_GLOO_WITH_OPENSSL="ON"
+export USE_GLOO_WITH_OPENSSL="${USE_GLOO_WITH_OPENSSL}"
# =================== The above code will be executed inside Docker container ===================
EOL
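Note: because the heredoc delimiter EOL is unquoted, ${USE_GLOO_WITH_OPENSSL} is expanded on the host when the env file is written, so the container sees the value computed above rather than a literal string. A minimal sketch of the mechanism (the file name is illustrative):

    USE_GLOO_WITH_OPENSSL="OFF"
    cat > /tmp/demo_env <<EOL
    export USE_GLOO_WITH_OPENSSL="${USE_GLOO_WITH_OPENSSL}"
    EOL
    cat /tmp/demo_env   # prints: export USE_GLOO_WITH_OPENSSL="OFF"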
@@ -26,7 +26,7 @@ runs:
        retry_wait_seconds: 30
        command: |
          set -eu
-          python3 -m pip install boto3==1.19.12
+          python3 -m pip install boto3==1.35.42

    - name: Download the cache
      shell: bash
@@ -33,7 +33,7 @@ runs:
        retry_wait_seconds: 30
        command: |
          set -eu
-          python3 -m pip install boto3==1.19.12
+          python3 -m pip install boto3==1.35.42

    - name: Upload the cache
      shell: bash
.github/ci_commit_pins/torchbench.txt (vendored, 2 lines changed)
@@ -1 +1 @@
-23512dbebd44a11eb84afbf53c3c071dd105297e
+e522b45cd4535b9dfe067aa68d7315755df38f48
.github/lf-scale-config.yml (vendored, 251 lines deleted)
@@ -1,251 +0,0 @@
-
-# This file is generated by .github/scripts/validate_scale_config.py in test-infra
-# It defines runner types that will be provisioned by by LF Self-hosted runners
-
-# scale-config.yml:
-#   Powers what instance types are available for GHA auto-scaled
-#   runners. Runners listed here will be available as self hosted
-#   runners, configuration is directly pulled from the main branch.
-#
-#
-# NOTES:
-#  - Linux runners are by default non-ephemeral to reduce the amount of CreateInstaces calls
-#    to avoid RequestLimitExceeded issues
-#  - When updating this file, run the following command to validate the YAML and to generate
-#    corresponding versions of scale-config for the pytorch/pytorch repo and merge the
-#    pytorch/pytorch changes before merging these changes.
-#    `python .github/scripts/validate_scale_config.py --test-infra-repo-root [path_to_test-infra_root] --pytorch-repo-root [path_to_pytorch_root]``
-#
-# TODO: Add some documentation on how the auto-scaling works
-#
-# NOTE: Default values,
-#
-# runner_types:
-#   runner_label:
-#     instance_type: m4.large
-#     os: linux
-#     max_available: 20
-#     disk_size: 50
-#     is_ephemeral: true
-
-runner_types:
-  lf.linux.12xlarge:
-    disk_size: 200
-    instance_type: c5.12xlarge
-    is_ephemeral: false
-    max_available: 2000
-    os: linux
-    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
-  lf.linux.10xlarge.avx2:
-    disk_size: 200
-    instance_type: m4.10xlarge
-    is_ephemeral: false
-    max_available: 450
-    os: linux
-    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
-  lf.linux.24xl.spr-metal:
-    disk_size: 200
-    instance_type: c7i.metal-24xl
-    is_ephemeral: false
-    max_available: 150
-    os: linux
-    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
-  lf.linux.16xlarge.spr:
-    disk_size: 200
-    instance_type: c7i.16xlarge
-    is_ephemeral: false
-    max_available: 150
-    os: linux
-    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
-  lf.linux.9xlarge.ephemeral:
-    disk_size: 200
-    instance_type: c5.9xlarge
-    is_ephemeral: true
-    max_available: 50
-    os: linux
-    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
-    variants:
-      am2:
-        ami: amzn2-ami-hvm-2.0.20240306.2-x86_64-ebs
-  lf.linux.12xlarge.ephemeral:
-    disk_size: 200
-    instance_type: c5.12xlarge
-    is_ephemeral: true
-    max_available: 300
-    os: linux
-    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
-  lf.linux.16xlarge.nvidia.gpu:
-    disk_size: 150
-    instance_type: g3.16xlarge
-    is_ephemeral: false
-    max_available: 150
-    os: linux
-    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
-  lf.linux.24xlarge:
-    disk_size: 150
-    instance_type: c5.24xlarge
-    is_ephemeral: false
-    max_available: 500
-    os: linux
-    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
-  lf.linux.24xlarge.ephemeral:
-    disk_size: 150
-    instance_type: c5.24xlarge
-    is_ephemeral: true
-    max_available: 200
-    os: linux
-    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
-  lf.linux.2xlarge:
-    disk_size: 150
-    instance_type: c5.2xlarge
-    is_ephemeral: false
-    max_available: 3120
-    os: linux
-    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
-  lf.linux.4xlarge:
-    disk_size: 150
-    instance_type: c5.4xlarge
-    is_ephemeral: false
-    max_available: 1000
-    os: linux
-    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
-  lf.linux.4xlarge.nvidia.gpu:
-    disk_size: 150
-    instance_type: g3.4xlarge
-    is_ephemeral: false
-    max_available: 1000
-    os: linux
-    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
-  lf.linux.8xlarge.nvidia.gpu:
-    disk_size: 150
-    instance_type: g3.8xlarge
-    is_ephemeral: false
-    max_available: 400
-    os: linux
-    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
-  lf.linux.g4dn.12xlarge.nvidia.gpu:
-    disk_size: 150
-    instance_type: g4dn.12xlarge
-    is_ephemeral: false
-    max_available: 250
-    os: linux
-    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
-  lf.linux.g4dn.metal.nvidia.gpu:
-    disk_size: 150
-    instance_type: g4dn.metal
-    is_ephemeral: false
-    max_available: 300
-    os: linux
-    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
-  lf.linux.g5.48xlarge.nvidia.gpu:
-    disk_size: 150
-    instance_type: g5.48xlarge
-    is_ephemeral: false
-    max_available: 200
-    os: linux
-    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
-  lf.linux.g5.12xlarge.nvidia.gpu:
-    disk_size: 150
-    instance_type: g5.12xlarge
-    is_ephemeral: false
-    max_available: 150
-    os: linux
-    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
-  lf.linux.g5.4xlarge.nvidia.gpu:
-    disk_size: 150
-    instance_type: g5.4xlarge
-    is_ephemeral: false
-    max_available: 2400
-    os: linux
-    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
-  lf.linux.g6.4xlarge.experimental.nvidia.gpu:
-    disk_size: 150
-    instance_type: g6.4xlarge
-    is_ephemeral: false
-    max_available: 50
-    os: linux
-    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
-  lf.linux.large:
-    max_available: 1200
-    disk_size: 15
-    instance_type: c5.large
-    is_ephemeral: false
-    os: linux
-    ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
-  lf.linux.arm64.2xlarge:
-    disk_size: 256
-    instance_type: t4g.2xlarge
-    is_ephemeral: false
-    max_available: 200
-    os: linux
-    ami: al2023-ami-2023.5.202*-kernel-6.1-arm64
-  lf.linux.arm64.m7g.4xlarge:
-    disk_size: 256
-    instance_type: m7g.4xlarge
-    is_ephemeral: false
-    max_available: 200
-    os: linux
-    ami: al2023-ami-2023.5.202*-kernel-6.1-arm64
-  lf.linux.arm64.2xlarge.ephemeral:
-    disk_size: 256
-    instance_type: t4g.2xlarge
-    is_ephemeral: true
-    max_available: 200
-    os: linux
-    ami: al2023-ami-2023.5.202*-kernel-6.1-arm64
-  lf.linux.arm64.m7g.4xlarge.ephemeral:
-    disk_size: 256
-    instance_type: m7g.4xlarge
-    is_ephemeral: true
-    max_available: 200
-    os: linux
-    ami: al2023-ami-2023.5.202*-kernel-6.1-arm64
-  lf.linux.arm64.m7g.metal:
-    disk_size: 256
-    instance_type: m7g.metal
-    is_ephemeral: false
-    max_available: 100
-    os: linux
-    ami: al2023-ami-2023.5.202*-kernel-6.1-arm64
-  lf.windows.g4dn.xlarge:
-    disk_size: 256
-    instance_type: g4dn.xlarge
-    is_ephemeral: true
-    max_available: 100
-    os: windows
-  lf.windows.g4dn.xlarge.nonephemeral:
-    disk_size: 256
-    instance_type: g4dn.xlarge
-    is_ephemeral: false
-    max_available: 100
-    os: windows
-  lf.windows.4xlarge:
-    disk_size: 256
-    instance_type: c5d.4xlarge
-    is_ephemeral: true
-    max_available: 420
-    os: windows
-  lf.windows.4xlarge.nonephemeral:
-    disk_size: 256
-    instance_type: c5d.4xlarge
-    is_ephemeral: false
-    max_available: 420
-    os: windows
-  lf.windows.8xlarge.nvidia.gpu:
-    disk_size: 256
-    instance_type: p3.2xlarge
-    is_ephemeral: true
-    max_available: 300
-    os: windows
-  lf.windows.8xlarge.nvidia.gpu.nonephemeral:
-    disk_size: 256
-    instance_type: p3.2xlarge
-    is_ephemeral: false
-    max_available: 300
-    os: windows
-  lf.windows.g5.4xlarge.nvidia.gpu:
-    disk_size: 256
-    instance_type: g5.4xlarge
-    is_ephemeral: false
-    max_available: 250
-    os: windows
.github/requirements-gha-cache.txt (vendored, 2 lines changed)
@@ -4,7 +4,7 @@
# docs/cpp/requirements.txt
# functorch/docs/requirements.txt
# .ci/docker/requirements-ci.txt
-boto3==1.19.12
+boto3==1.35.42
jinja2==3.1.4
lintrunner==0.10.7
ninja==1.10.0.post1
@@ -1,4 +1,4 @@
-boto3==1.19.12
+boto3==1.35.42
hypothesis==6.56.4
expecttest==0.2.1
fbscribelogger==0.1.6
@@ -459,7 +459,7 @@ def generate_wheels_matrix(
                        ".", "_"
                    ),
                    "pytorch_extra_install_requirements": (
-                        PYTORCH_EXTRA_INSTALL_REQUIREMENTS["12.1"]
+                        PYTORCH_EXTRA_INSTALL_REQUIREMENTS["12.4"]
                        if os != "linux" and gpu_arch_type != "xpu"
                        else ""
                    ),
.github/scripts/generate_ci_workflows.py (vendored, 60 lines changed)
@@ -114,20 +114,21 @@ LINUX_BINARY_BUILD_WORFKLOWS = [
            isolated_workflow=True,
        ),
    ),
-    BinaryBuildWorkflow(
-        os=OperatingSystem.LINUX,
-        package_type="manywheel",
-        build_configs=generate_binary_build_matrix.generate_wheels_matrix(
-            OperatingSystem.LINUX,
-            use_split_build=True,
-            arches=["11.8", "12.1", "12.4", "cpu"],
-        ),
-        ciflow_config=CIFlowConfig(
-            labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_WHEEL},
-            isolated_workflow=True,
-        ),
-        use_split_build=True,
-    ),
+    # See https://github.com/pytorch/pytorch/issues/138750
+    # BinaryBuildWorkflow(
+    #     os=OperatingSystem.LINUX,
+    #     package_type="manywheel",
+    #     build_configs=generate_binary_build_matrix.generate_wheels_matrix(
+    #         OperatingSystem.LINUX,
+    #         use_split_build=True,
+    #         arches=["11.8", "12.1", "12.4", "cpu"],
+    #     ),
+    #     ciflow_config=CIFlowConfig(
+    #         labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_WHEEL},
+    #         isolated_workflow=True,
+    #     ),
+    #     use_split_build=True,
+    # ),
    BinaryBuildWorkflow(
        os=OperatingSystem.LINUX,
        package_type="conda",
@@ -180,21 +181,22 @@ LINUX_BINARY_SMOKE_WORKFLOWS = [
        ),
        branches="main",
    ),
-    BinaryBuildWorkflow(
-        os=OperatingSystem.LINUX,
-        package_type="manywheel",
-        build_configs=generate_binary_build_matrix.generate_wheels_matrix(
-            OperatingSystem.LINUX,
-            arches=["11.8", "12.1", "12.4"],
-            python_versions=["3.9"],
-            use_split_build=True,
-        ),
-        ciflow_config=CIFlowConfig(
-            labels={LABEL_CIFLOW_PERIODIC},
-        ),
-        branches="main",
-        use_split_build=True,
-    ),
+    # See https://github.com/pytorch/pytorch/issues/138750
+    # BinaryBuildWorkflow(
+    #     os=OperatingSystem.LINUX,
+    #     package_type="manywheel",
+    #     build_configs=generate_binary_build_matrix.generate_wheels_matrix(
+    #         OperatingSystem.LINUX,
+    #         arches=["11.8", "12.1", "12.4"],
+    #         python_versions=["3.9"],
+    #         use_split_build=True,
+    #     ),
+    #     ciflow_config=CIFlowConfig(
+    #         labels={LABEL_CIFLOW_PERIODIC},
+    #     ),
+    #     branches="main",
+    #     use_split_build=True,
+    # ),
    BinaryBuildWorkflow(
        os=OperatingSystem.LINUX,
        package_type="libtorch",
.github/workflows/_binary-build-linux.yml (vendored, 2 lines changed)
@@ -271,7 +271,7 @@ jobs:
          )
          docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh"
          if [[ ${BUILD_ENVIRONMENT} == *"aarch64"* ]]; then
-            docker exec -t "${container_name}" bash -c "bash /builder/aarch64_linux/aarch64_ci_build.sh"
+            docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/aarch64_linux/aarch64_ci_build.sh"
          elif [[ ${{ inputs.PACKAGE_TYPE }} == "manywheel" || ${{ inputs.PACKAGE_TYPE }} == "libtorch" ]]; then
            docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /pytorch/.ci/${{ inputs.PACKAGE_TYPE }}/build.sh"
          else
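Note: the aarch64 branch now matches the manywheel/libtorch branches below it. Each docker exec starts a fresh shell, so the env file written by binary_populate_env.sh must be re-sourced in every invocation or its exports are lost. A minimal sketch (the container name is illustrative; /tmp/env is the BINARY_ENV_FILE used elsewhere in this diff):

    # without `source`, DESIRED_CUDA would be unset in this new shell
    docker exec -t "$container_name" bash -c "source /tmp/env && echo \$DESIRED_CUDA"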
.github/workflows/_linux-test.yml (vendored, 3 lines changed)
@@ -230,7 +230,7 @@ jobs:
          HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
          SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }}
          IS_A100_RUNNER: ${{ contains(matrix.runner, 'a100') && '1' || '0' }}
-
+          ARTIFACTS_FILE_SUFFIX: ${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}_${{ steps.get-job-id.outputs.job-id }}
        run: |
          set -x
@@ -289,6 +289,7 @@ jobs:
            -e SCRIBE_GRAPHQL_ACCESS_TOKEN \
            -e DASHBOARD_TAG \
            -e IS_A100_RUNNER \
+            -e ARTIFACTS_FILE_SUFFIX \
            --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
            --security-opt seccomp=unconfined \
            --cap-add=SYS_PTRACE \
.github/workflows/_runner-determinator.yml (vendored, 2 lines changed)
@@ -40,6 +40,8 @@ on:

jobs:
  runner-determinator:
+    # Don't run on forked repos
+    if: github.repository_owner == 'pytorch'
    runs-on: ubuntu-latest
    outputs:
      label-type: ${{ steps.set-condition.outputs.label-type }}
.github/workflows/build-conda-images.yml (vendored, 2 lines changed)
@@ -35,7 +35,7 @@ jobs:
    runs-on: linux.9xlarge.ephemeral
    strategy:
      matrix:
-        cuda_version: ["11.8", "12.1", "12.4", "cpu"]
+        cuda_version: ["11.8", "12.1", "12.4", "12.6", "cpu"]
    env:
      CUDA_VERSION: ${{ matrix.cuda_version }}
    steps:
.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml (generated, vendored, 8 lines changed)
@@ -65,7 +65,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_9-cpu-aarch64
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_9-cpu-aarch64-test: # Testing
@@ -185,7 +185,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_10-cpu-aarch64
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_10-cpu-aarch64-test: # Testing
@@ -305,7 +305,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_11-cpu-aarch64
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_11-cpu-aarch64-test: # Testing
@@ -425,7 +425,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_12-cpu-aarch64
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_12-cpu-aarch64-test: # Testing
.github/workflows/generated-linux-binary-manywheel-split-main.yml (generated, vendored, 182 lines deleted)
@@ -1,182 +0,0 @@
-# @generated DO NOT EDIT MANUALLY
-
-# Template is at:    .github/templates/linux_binary_build_workflow.yml.j2
-# Generation script: .github/scripts/generate_ci_workflows.py
-name: linux-binary-manywheel-split
-
-
-on:
-  push:
-    branches:
-      - main
-    tags:
-      - 'ciflow/periodic/*'
-  workflow_dispatch:
-
-env:
-  # Needed for conda builds
-  ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
-  ANACONDA_USER: pytorch
-  AWS_DEFAULT_REGION: us-east-1
-  BINARY_ENV_FILE: /tmp/env
-  BUILD_ENVIRONMENT: linux-binary-manywheel-split
-  BUILDER_ROOT: /builder
-  GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-  PR_NUMBER: ${{ github.event.pull_request.number }}
-  PYTORCH_FINAL_PACKAGE_DIR: /artifacts
-  PYTORCH_ROOT: /pytorch
-  SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
-  SKIP_ALL_TESTS: 0
-concurrency:
-  group: linux-binary-manywheel-split-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
-  cancel-in-progress: true
-
-jobs:
-  get-label-type:
-    name: get-label-type
-    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
-    with:
-      triggering_actor: ${{ github.triggering_actor }}
-      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
-      curr_branch: ${{ github.head_ref || github.ref_name }}
-      curr_ref_type: ${{ github.ref_type }}
-  manywheel-py3_9-cuda11_8-build:
-    if: ${{ github.repository_owner == 'pytorch' }}
-    uses: ./.github/workflows/_binary-build-linux.yml
-    needs: get-label-type
-    with:
-      PYTORCH_ROOT: /pytorch
-      BUILDER_ROOT: /builder
-      PACKAGE_TYPE: manywheel
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu118
-      GPU_ARCH_VERSION: 11.8
-      GPU_ARCH_TYPE: cuda
-      DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8-main
-      use_split_build: True
-      DESIRED_PYTHON: "3.9"
-      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      build_name: manywheel-py3_9-cuda11_8
-      build_environment: linux-binary-manywheel-split
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64'
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-  manywheel-py3_9-cuda11_8-test: # Testing
-    if: ${{ github.repository_owner == 'pytorch' }}
-    needs:
-      - manywheel-py3_9-cuda11_8-build
-      - get-label-type
-    uses: ./.github/workflows/_binary-test-linux.yml
-    with:
-      PYTORCH_ROOT: /pytorch
-      BUILDER_ROOT: /builder
-      PACKAGE_TYPE: manywheel
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu118
-      GPU_ARCH_VERSION: 11.8
-      GPU_ARCH_TYPE: cuda
-      DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8-main
-      use_split_build: True
-      DESIRED_PYTHON: "3.9"
-      build_name: manywheel-py3_9-cuda11_8
-      build_environment: linux-binary-manywheel-split
-      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.4xlarge.nvidia.gpu
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-
-  manywheel-py3_9-cuda12_1-build:
-    if: ${{ github.repository_owner == 'pytorch' }}
-    uses: ./.github/workflows/_binary-build-linux.yml
-    needs: get-label-type
-    with:
-      PYTORCH_ROOT: /pytorch
-      BUILDER_ROOT: /builder
-      PACKAGE_TYPE: manywheel
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu121
-      GPU_ARCH_VERSION: 12.1
-      GPU_ARCH_TYPE: cuda
-      DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main
-      use_split_build: True
-      DESIRED_PYTHON: "3.9"
-      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      build_name: manywheel-py3_9-cuda12_1
-      build_environment: linux-binary-manywheel-split
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-  manywheel-py3_9-cuda12_1-test: # Testing
-    if: ${{ github.repository_owner == 'pytorch' }}
-    needs:
-      - manywheel-py3_9-cuda12_1-build
-      - get-label-type
-    uses: ./.github/workflows/_binary-test-linux.yml
-    with:
-      PYTORCH_ROOT: /pytorch
-      BUILDER_ROOT: /builder
-      PACKAGE_TYPE: manywheel
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu121
-      GPU_ARCH_VERSION: 12.1
-      GPU_ARCH_TYPE: cuda
-      DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main
-      use_split_build: True
-      DESIRED_PYTHON: "3.9"
-      build_name: manywheel-py3_9-cuda12_1
-      build_environment: linux-binary-manywheel-split
-      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.4xlarge.nvidia.gpu
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-
-  manywheel-py3_9-cuda12_4-build:
-    if: ${{ github.repository_owner == 'pytorch' }}
-    uses: ./.github/workflows/_binary-build-linux.yml
-    needs: get-label-type
-    with:
-      PYTORCH_ROOT: /pytorch
-      BUILDER_ROOT: /builder
-      PACKAGE_TYPE: manywheel
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu124
-      GPU_ARCH_VERSION: 12.4
-      GPU_ARCH_TYPE: cuda
-      DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.4-main
-      use_split_build: True
-      DESIRED_PYTHON: "3.9"
-      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      build_name: manywheel-py3_9-cuda12_4
-      build_environment: linux-binary-manywheel-split
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-  manywheel-py3_9-cuda12_4-test: # Testing
-    if: ${{ github.repository_owner == 'pytorch' }}
-    needs:
-      - manywheel-py3_9-cuda12_4-build
-      - get-label-type
-    uses: ./.github/workflows/_binary-test-linux.yml
-    with:
-      PYTORCH_ROOT: /pytorch
-      BUILDER_ROOT: /builder
-      PACKAGE_TYPE: manywheel
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu124
-      GPU_ARCH_VERSION: 12.4
-      GPU_ARCH_TYPE: cuda
-      DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.4-main
-      use_split_build: True
-      DESIRED_PYTHON: "3.9"
-      build_name: manywheel-py3_9-cuda12_4
-      build_environment: linux-binary-manywheel-split
-      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.4xlarge.nvidia.gpu
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
.github/workflows/generated-linux-binary-manywheel-split-nightly.yml (generated, vendored, 1796 lines changed)
File diff suppressed because it is too large.
.github/workflows/generated-linux-s390x-binary-manywheel-nightly.yml (generated, vendored, 10 lines changed)
@ -64,7 +64,7 @@ jobs:
|
||||
ALPINE_IMAGE: "docker.io/s390x/alpine"
|
||||
build_name: manywheel-py3_9-cpu-s390x
|
||||
build_environment: linux-s390x-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_9-cpu-s390x-test: # Testing
|
||||
@ -133,7 +133,7 @@ jobs:
|
||||
ALPINE_IMAGE: "docker.io/s390x/alpine"
|
||||
build_name: manywheel-py3_10-cpu-s390x
|
||||
build_environment: linux-s390x-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_10-cpu-s390x-test: # Testing
|
||||
@ -202,7 +202,7 @@ jobs:
|
||||
ALPINE_IMAGE: "docker.io/s390x/alpine"
|
||||
build_name: manywheel-py3_11-cpu-s390x
|
||||
build_environment: linux-s390x-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_11-cpu-s390x-test: # Testing

@@ -271,7 +271,7 @@ jobs:
ALPINE_IMAGE: "docker.io/s390x/alpine"
build_name: manywheel-py3_12-cpu-s390x
build_environment: linux-s390x-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_12-cpu-s390x-test: # Testing

@@ -340,7 +340,7 @@ jobs:
ALPINE_IMAGE: "docker.io/s390x/alpine"
build_name: manywheel-py3_13-cpu-s390x
build_environment: linux-s390x-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13-cpu-s390x-test: # Testing
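The PYTORCH_EXTRA_INSTALL_REQUIREMENTS value used throughout these jobs packs several PEP 508 requirement strings into one variable, joined with " | ", each gated on an environment marker so the nvidia-* wheels are only pulled in on x86_64 Linux. A minimal sketch of how such a packed string can be split and filtered with the packaging library (illustrative names only, not the exact build-script code):

from packaging.requirements import Requirement

# Two entries in the same packed format as the variable above (abbreviated).
PACKED = (
    "nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64'"
    " | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'"
)

def active_requirements(packed):
    # Split on the " | " separator, parse each PEP 508 string, and keep the
    # entries whose marker matches the current interpreter's environment.
    for chunk in packed.split(" | "):
        req = Requirement(chunk.strip())
        if req.marker is None or req.marker.evaluate():
            yield req

for req in active_requirements(PACKED):
    print(req.name, req.specifier)

On an x86_64 Linux host this prints both pins; anywhere else it prints nothing, which is why the same metadata is safe to embed in every wheel.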
.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml (10 changed lines, generated, vendored)
@@ -46,7 +46,7 @@ jobs:
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.9"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the

@@ -162,7 +162,7 @@ jobs:
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.10"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the

@@ -278,7 +278,7 @@ jobs:
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.11"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the

@@ -394,7 +394,7 @@ jobs:
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.12"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the

@@ -510,7 +510,7 @@ jobs:
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.13"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
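Even in these macOS arm64 wheel jobs the added requirements keep the platform_system == 'Linux' and platform_machine == 'x86_64' marker, so pip should treat them as no-ops off x86_64 Linux. A small illustration (my own example, not taken from the diff) of how the marker resolves per platform:

from packaging.markers import Marker

marker = Marker("platform_system == 'Linux' and platform_machine == 'x86_64'")

# evaluate() overlays the given keys on the current interpreter's environment,
# letting us ask how the marker would resolve on other platforms.
for env in (
    {"platform_system": "Linux", "platform_machine": "x86_64"},   # manylinux install
    {"platform_system": "Darwin", "platform_machine": "arm64"},   # macOS arm64 install
    {"platform_system": "Windows", "platform_machine": "AMD64"},  # Windows install
):
    print(env, "->", marker.evaluate(env))

Only the first environment prints True, so the nvidia-* pins are skipped on the macOS wheels above and the Windows wheels that follow.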
.github/workflows/generated-windows-binary-wheel-nightly.yml (32 changed lines, generated, vendored)
@@ -55,7 +55,7 @@ jobs:
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.9"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash

@@ -322,7 +322,7 @@ jobs:
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.9"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash

@@ -591,7 +591,7 @@ jobs:
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.9"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash

@@ -860,7 +860,7 @@ jobs:
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.9"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash

@@ -1393,7 +1393,7 @@ jobs:
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.10"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash

@@ -1660,7 +1660,7 @@ jobs:
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.10"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash

@@ -1929,7 +1929,7 @@ jobs:
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.10"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash

@@ -2198,7 +2198,7 @@ jobs:
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.10"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash

@@ -2731,7 +2731,7 @@ jobs:
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.11"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash

@@ -2998,7 +2998,7 @@ jobs:
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.11"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash

@@ -3267,7 +3267,7 @@ jobs:
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.11"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash

@@ -3536,7 +3536,7 @@ jobs:
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.11"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash

@@ -4069,7 +4069,7 @@ jobs:
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.12"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash

@@ -4336,7 +4336,7 @@ jobs:
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.12"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash

@@ -4605,7 +4605,7 @@ jobs:
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.12"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash

@@ -4874,7 +4874,7 @@ jobs:
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.12"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
|
||||
steps:
|
||||
- name: Display EC2 information
|
||||
shell: bash
|
||||
|
||||
.github/workflows/inductor-perf-compare.yml (28 changed lines, vendored)
@ -13,8 +13,8 @@ concurrency:
permissions: read-all

jobs:
get-label-type:
name: get-label-type
get-default-label-prefix:
name: get-default-label-prefix
uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
with:
triggering_actor: ${{ github.triggering_actor }}
@ -22,21 +22,33 @@ jobs:
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}

get-test-label-type:
name: get-test-label-type
uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
check_experiments: "awsa100"

linux-focal-cuda12_1-py3_10-gcc9-inductor-build:
name: cuda12.1-py3.10-gcc9-sm80
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
needs:
- get-default-label-prefix
- get-test-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
build-environment: linux-focal-cuda12.1-py3.10-gcc9-sm80
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.0'
test-matrix: |
{ include: [
{ config: "inductor_huggingface_perf_compare", shard: 1, num_shards: 1, runner: "linux.gcp.a100" },
{ config: "inductor_timm_perf_compare", shard: 1, num_shards: 2, runner: "linux.gcp.a100" },
{ config: "inductor_timm_perf_compare", shard: 2, num_shards: 2, runner: "linux.gcp.a100" },
{ config: "inductor_torchbench_perf_compare", shard: 1, num_shards: 1, runner: "linux.gcp.a100" },
{ config: "inductor_huggingface_perf_compare", shard: 1, num_shards: 1, runner: "${{ needs.get-test-label-type.outputs.label-type }}linux.gcp.a100" },
{ config: "inductor_timm_perf_compare", shard: 1, num_shards: 2, runner: "${{ needs.get-test-label-type.outputs.label-type }}linux.gcp.a100" },
{ config: "inductor_timm_perf_compare", shard: 2, num_shards: 2, runner: "${{ needs.get-test-label-type.outputs.label-type }}linux.gcp.a100" },
{ config: "inductor_torchbench_perf_compare", shard: 1, num_shards: 1, runner: "${{ needs.get-test-label-type.outputs.label-type }}linux.gcp.a100" },
]}
secrets:
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
.github/workflows/llm_td_retrieval.yml (5 changed lines, vendored)
@ -8,9 +8,10 @@ permissions:
contents: read

jobs:

get-label-type:
name: get-label-type
# Don't run on forked repos
if: github.repository_owner == 'pytorch'
uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
with:
triggering_actor: ${{ github.triggering_actor }}
@ -19,6 +20,8 @@ jobs:
curr_ref_type: ${{ github.ref_type }}

llm-retrieval:
# Don't run on forked repos
if: github.repository_owner == 'pytorch'
runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge"
continue-on-error: true
needs: get-label-type

@ -32,7 +32,7 @@ jobs:
cache: pip

- run: |
pip3 install requests==2.32.2 rockset==1.0.3 boto3==1.19.12
pip3 install requests==2.32.2 rockset==1.0.3 boto3==1.35.42

- name: Upload external contribution stats
uses: nick-fields/retry@v3.0.0
.github/workflows/periodic.yml (3 changed lines, vendored)
@ -333,6 +333,7 @@ jobs:
name: linux-focal-cuda12.1-py3.10-gcc9-experimental-split-build
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
if: false # See https://github.com/pytorch/pytorch/issues/138750
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
use_split_build: true
@ -363,6 +364,7 @@ jobs:
name: linux-focal-cuda11.8-py3.9-gcc9-experimental-split-build
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
if: false # See https://github.com/pytorch/pytorch/issues/138750
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
use_split_build: true
@ -390,6 +392,7 @@ jobs:
name: linux-focal-cuda11.8-py3.10-gcc9-experimental-split-build
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
if: false # See https://github.com/pytorch/pytorch/issues/138750
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
use_split_build: true
.github/workflows/pull.yml (1 changed line, vendored)
@ -579,6 +579,7 @@ jobs:
secrets: inherit

linux-focal-py3_12-clang10-experimental-split-build:
if: false # See https://github.com/pytorch/pytorch/issues/138750
name: linux-focal-py3.12-clang10-experimental-split-build
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
.github/workflows/target_determination.yml (4 changed lines, vendored)
@ -7,6 +7,8 @@ jobs:

get-label-type:
name: get-label-type
# Don't run on forked repos
if: github.repository_owner == 'pytorch'
uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
with:
triggering_actor: ${{ github.triggering_actor }}
@ -70,7 +72,7 @@ jobs:
PR_NUMBER: ${{ github.event.pull_request.number }}
run: |
unzip -o .additional_ci_files/llm_results/mappings.zip -d .additional_ci_files/llm_results || true
python3 -m pip install boto3==1.19.12
python3 -m pip install boto3==1.35.42
python3 tools/testing/do_target_determination_for_s3.py

- name: Upload TD results to s3
.github/workflows/trunk.yml (1 changed line, vendored)
@ -256,6 +256,7 @@ jobs:
tests-to-include: "test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs test_autograd inductor/test_torchinductor distributed/test_c10d_common distributed/test_c10d_nccl"

linux-focal-cuda12_4-py3_10-gcc9-experimental-split-build:
if: false # See https://github.com/pytorch/pytorch/issues/138750
name: linux-focal-cuda12.4-py3.10-gcc9-experimental-split-build
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
.github/workflows/update_pytorch_labels.yml (2 changed lines, vendored)
@ -29,5 +29,5 @@ jobs:
aws-region: us-east-1
- name: Update PyTorch labels list in S3
run: |
python3 -m pip install boto3==1.19.12
python3 -m pip install boto3==1.35.42
.github/scripts/export_pytorch_labels.py pytorch pytorch
.github/workflows/upload-test-stats.yml (2 changed lines, vendored)
@ -53,7 +53,7 @@ jobs:
cache: pip

- run: |
pip3 install requests==2.32.2 rockset==1.0.3 boto3==1.19.12
pip3 install requests==2.32.2 rockset==1.0.3 boto3==1.35.42

- name: Upload test artifacts
id: upload-s3

@ -49,7 +49,7 @@ jobs:
cache: pip

- run: |
pip3 install requests==2.32.2 rockset==1.0.3 boto3==1.19.12
pip3 install requests==2.32.2 rockset==1.0.3 boto3==1.35.42

- name: Upload torch dynamo performance stats to S3
id: upload-s3

@ -28,7 +28,7 @@ jobs:
cache: pip

- run: |
pip3 install requests==2.32.2 rockset==1.0.3 boto3==1.19.12
pip3 install requests==2.32.2 rockset==1.0.3 boto3==1.35.42

- name: Upload test stats
env:
@ -1402,7 +1402,7 @@ init_command = [
'black==23.12.1',
'usort==1.0.8.post1',
'isort==5.13.2',
'ruff==0.6.3', # sync with RUFF
'ruff==0.7.0', # sync with RUFF
]
is_formatter = true

@ -1487,7 +1487,7 @@ init_command = [
'python3',
'tools/linter/adapters/pip_init.py',
'--dry-run={{DRYRUN}}',
'ruff==0.6.3', # sync with PYFMT
'ruff==0.7.0', # sync with PYFMT
]
is_formatter = true
@ -878,7 +878,7 @@ Process 87741 stopped
* thread #1, queue = 'com.apple.main-thread', stop reason = breakpoint 1.1
frame #0: 0x00000001024e2628 libtorch_python.dylib`at::indexing::impl::applySelect(self=0x00000001004ee8a8, dim=0, index=(data_ = 3), real_dim=0, (null)=0x000000016fdfe535, self_sizes= Has Value=true ) at TensorIndexing.h:239:7
236 const at::Device& /*self_device*/,
237 const c10::optional<SymIntArrayRef>& self_sizes) {
237 const std::optional<SymIntArrayRef>& self_sizes) {
238 // See NOTE [nested tensor size for indexing]
-> 239 if (self_sizes.has_value()) {
240 auto maybe_index = index.maybe_as_int();
@ -1081,10 +1081,6 @@ Here are a few well known pitfalls and workarounds:
catch all of these problems: stay vigilant to the possibility that
your crash is due to a real memory problem.

* (NVCC) `c10::optional` does not work when used from device code. Don't use
it from kernels. Upstream issue: https://github.com/akrzemi1/Optional/issues/58
and our local issue #10329.

* `constexpr` generally works less well on MSVC.

* The idiom `static_assert(f() == f())` to test if `f` is constexpr
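Editor's note: the hunk above is cut off mid-bullet by the diff view. For readers unfamiliar with that last idiom, here is a minimal self-contained sketch of it (the function name `f` is illustrative, not from the source): the assertion only compiles if `f()` can be evaluated at compile time, which is what makes it a probe for constexpr-ness and, per the pitfall list, a portability hazard on compilers with weaker constexpr support such as MSVC.

#include <cstdint>

// Probe: compiles only when f() is usable in a constant expression.
constexpr int64_t f() { return 42; }

// If f() were not constexpr-evaluable, this line would fail to compile,
// not because the values differ but because neither call is a constant.
static_assert(f() == f(), "f must be constexpr");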
@ -43,9 +43,19 @@ class TORCH_API Context {

if (device_type == at::kCPU) {
return at::detail::getDefaultCPUGenerator();
} else if (device_type == at::kCUDA) {
return at::detail::getCUDAHooks().getDefaultCUDAGenerator(device.index());
} else if (device_type == at::kMPS) {
return at::detail::getMPSHooks().getDefaultMPSGenerator();
} else if (device_type == at::kXPU) {
return at::detail::getXPUHooks().getDefaultXPUGenerator(device.index());
} else if (device_type == at::kIPU) {
return at::detail::getIPUHooks().getDefaultIPUGenerator(device.index());
} else if (device_type == at::kPrivateUse1) {
return at::detail::getPrivateUse1Hooks().getDefaultGenerator(
device.index());
} else {
return getAcceleratorHooksInterface(device_type)
.getDefaultGenerator(device.index());
AT_ERROR(c10::DeviceTypeName(device_type), " device type not enabled.");
}
}
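Editor's note: one side of this hunk routes every backend through a single accelerator-hooks dispatch instead of the explicit per-device chain. A minimal sketch of that dispatch pattern, under stated assumptions (`HooksInterface`, `registry`, `CpuHooks`, and the `int` return standing in for `at::Generator` are all illustrative, not the actual ATen types):

#include <map>
#include <memory>
#include <stdexcept>
#include <utility>

// Each backend implements one shared interface...
struct HooksInterface {
  virtual ~HooksInterface() = default;
  virtual int defaultGenerator(int device_index) const = 0;
};

struct CpuHooks : HooksInterface {
  int defaultGenerator(int) const override { return 0; }
};

// ...registered once per device type...
std::map<int, std::unique_ptr<HooksInterface>>& registry() {
  static std::map<int, std::unique_ptr<HooksInterface>> r;
  return r;
}

void register_backend(int device_type, std::unique_ptr<HooksInterface> hooks) {
  registry()[device_type] = std::move(hooks);
}

// ...so one lookup replaces the long if/else chain over device types.
int default_generator(int device_type, int device_index) {
  auto it = registry().find(device_type);
  if (it == registry().end()) {
    throw std::runtime_error("device type not enabled");
  }
  return it->second->defaultGenerator(device_index);
}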
@ -38,11 +38,9 @@ inline constexpr bool should_include_kernel_dtype(
* binary.
*/
#if defined ENABLE_RECORD_KERNEL_FUNCTION_DTYPE
namespace at {
namespace detail {
namespace at::detail {
TORCH_API void record_kernel_function_dtype(std::string name);
}
} // namespace at
} // namespace at::detail

#define RECORD_KERNEL_FUNCTION_DTYPE(NAME, enum_type) \
at::detail::record_kernel_function_dtype( \

@ -112,6 +112,10 @@ class TORCH_API RefcountedMapAllocator : private RefcountedMapAllocatorArgCheck,
size_t size);

static RefcountedMapAllocator* fromDataPtr(const at::DataPtr&);
RefcountedMapAllocator(const RefcountedMapAllocator&) = delete;
RefcountedMapAllocator(RefcountedMapAllocator&&) = delete;
RefcountedMapAllocator& operator=(const RefcountedMapAllocator&) = delete;
RefcountedMapAllocator& operator=(RefcountedMapAllocator&&) = delete;
static at::DataPtr makeDataPtr(
const char* filename,
int flags,

@ -61,7 +61,7 @@ MemOverlapStatus get_overlap_status(const TensorImpl* a, const TensorImpl* b) {
// same pointer across multiple storages there are many
// similar situations (e.g., storage().data() == storage().data()+1)
// which we will miss.
auto a_storage = a->unsafe_storage();
const auto& a_storage = a->unsafe_storage();
if (a_storage && a_storage.is_alias_of(b->unsafe_storage())) {
const auto a_begin = static_cast<const char*>(a->data());
const auto a_end = a_begin + a->numel() * a->itemsize();
@ -8,6 +8,17 @@

namespace c10 {

namespace detail {
template <typename Base, typename Child, typename... Args>
std::enable_if_t<
!std::is_array_v<Base> && !std::is_array_v<Child> &&
std::is_base_of_v<Base, Child>,
std::unique_ptr<Base>>
make_unique_base(Args&&... args) {
return std::unique_ptr<Base>(new Child(std::forward<Args>(args)...));
}
}

inline KernelFunction::KernelFunction()
: boxed_kernel_func_()
, unboxed_kernel_func_(nullptr)
@ -183,7 +194,7 @@ inline KernelFunction KernelFunction::makeFromUnboxedFunction(FuncPtr func_ptr)
#if !defined(C10_MOBILE)
(void)func_ptr; // Suppress unused variable warning
return makeFromUnboxedFunctor<AllowLegacyTypes, typename impl::WrapFunctionIntoFunctor<FuncPtr>::type>(
guts::make_unique_base<OperatorKernel, typename impl::WrapFunctionIntoFunctor<FuncPtr>::type>()
detail::make_unique_base<OperatorKernel, typename impl::WrapFunctionIntoFunctor<FuncPtr>::type>()
);
#else
// On mobile, we rather want to optimize for binary size than for performance,
@ -200,7 +211,7 @@ inline KernelFunction KernelFunction::makeFromUnboxedRuntimeFunction(FuncType* f
TORCH_INTERNAL_ASSERT(func != nullptr, "Kernel function cannot be nullptr");

return makeFromUnboxedFunctor<AllowLegacyTypes, impl::WrapFunctionIntoRuntimeFunctor<std::decay_t<FuncType>>>(
guts::make_unique_base<OperatorKernel, impl::WrapFunctionIntoRuntimeFunctor<std::decay_t<FuncType>>>(func)
detail::make_unique_base<OperatorKernel, impl::WrapFunctionIntoRuntimeFunctor<std::decay_t<FuncType>>>(func)
);
}

@ -210,7 +221,7 @@ inline std::enable_if_t<guts::is_stateless_lambda<std::decay_t<Lambda>>::value,

#if !defined(C10_MOBILE)
return makeFromUnboxedFunctor<AllowLegacyTypes, impl::WrapFunctionIntoRuntimeFunctor<std::decay_t<Lambda>>>(
guts::make_unique_base<OperatorKernel, impl::WrapFunctionIntoRuntimeFunctor<std::decay_t<Lambda>>>(std::forward<Lambda>(lambda))
detail::make_unique_base<OperatorKernel, impl::WrapFunctionIntoRuntimeFunctor<std::decay_t<Lambda>>>(std::forward<Lambda>(lambda))
);
#else
// On mobile, we rather want to optimize for binary size than for performance,
@ -226,7 +237,7 @@ inline std::enable_if_t<!guts::is_stateless_lambda<std::decay_t<Lambda>>::value,
static_assert(guts::is_functor<std::decay_t<Lambda>>::value, "Tried to call KernelFunction::makeFromUnboxedLambda with a non-lambda type.");

return makeFromUnboxedFunctor<AllowLegacyTypes, impl::WrapFunctionIntoRuntimeFunctor<std::decay_t<Lambda>>>(
guts::make_unique_base<OperatorKernel, impl::WrapFunctionIntoRuntimeFunctor<std::decay_t<Lambda>>>(std::forward<Lambda>(lambda))
detail::make_unique_base<OperatorKernel, impl::WrapFunctionIntoRuntimeFunctor<std::decay_t<Lambda>>>(std::forward<Lambda>(lambda))
);
}
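Editor's note: the `detail::make_unique_base` helper this file now uses (mirroring the template shown in the first hunk above) builds a `std::unique_ptr<Base>` that owns a freshly constructed `Child`, with the SFINAE constraint rejecting array types and non-derived classes. A hedged usage sketch; `MyKernel` and `main` are illustrative, not from the source:

#include <memory>
#include <type_traits>
#include <utility>

template <typename Base, typename Child, typename... Args>
std::enable_if_t<!std::is_array_v<Base> && !std::is_array_v<Child> &&
                     std::is_base_of_v<Base, Child>,
                 std::unique_ptr<Base>>
make_unique_base(Args&&... args) {
  // Allocate the derived type, but hand ownership back typed as the base.
  return std::unique_ptr<Base>(new Child(std::forward<Args>(args)...));
}

struct OperatorKernel { virtual ~OperatorKernel() = default; };
struct MyKernel : OperatorKernel {
  explicit MyKernel(int v) : v_(v) {}
  int v_;
};

int main() {
  // What the makeFromUnboxed* factories do: store derived kernels behind a
  // base-class unique_ptr.
  std::unique_ptr<OperatorKernel> k =
      make_unique_base<OperatorKernel, MyKernel>(42);
}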
@ -3,6 +3,7 @@
#include <condition_variable>
#include <memory>
#include <optional>
#include <tuple>
#include <type_traits>
#include <utility>


@ -295,6 +295,19 @@ public:
}
return false;
}
// TODO: Remove this once the issue with MSVC is fixed
// See https://developercommunity.visualstudio.com/t/MSVC-loop-unrolling-problem-194033813-/10720692
#if defined(_WIN32) && defined(__aarch64__)
Vectorized<T> map(T (*const f)(T)) const {
Vectorized<T> ret;
for (int64_t i = 0; i < size(); i++) {
ret[i] = f(values[i]);
if (++i < size())
ret[i] = f(values[i]);
}
return ret;
}
#else
Vectorized<T> map(T (*const f)(T)) const {
Vectorized<T> ret;
for (int64_t i = 0; i != size(); i++) {
@ -302,6 +315,7 @@ public:
}
return ret;
}
#endif
Vectorized<T> map(T (*const f)(const T &)) const {
Vectorized<T> ret;
for (int64_t i = 0; i != size(); i++) {
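Editor's note: the guarded `map` overload above works around the linked MSVC loop-unrolling miscompile on Windows/ARM64 by advancing the index twice per iteration by hand, so the compiler never sees a simple single-step loop to unroll. A standalone sketch of the same transformation on a plain array (the size constant and names are illustrative):

#include <cstdint>

constexpr int64_t kSize = 8;

// Pairs two element updates per loop iteration, mirroring the
// #if defined(_WIN32) && defined(__aarch64__) branch quoted above.
void apply(float (*f)(float), const float* in, float* out) {
  for (int64_t i = 0; i < kSize; i++) {
    out[i] = f(in[i]);
    if (++i < kSize)
      out[i] = f(in[i]);  // second element of the manually unrolled pair
  }
}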
@ -34,7 +34,7 @@ public:

private:
cublasHandle_t handle;
cublasPointerMode_t previous_mode;
cublasPointerMode_t previous_mode{};
};

/* LEVEL 3 BLAS FUNCTIONS */

@ -31,7 +31,7 @@ static std::vector<Generator> default_gens_cuda;
* Warning: this function must only be called once!
*/
static void initCUDAGenVector() {
num_gpus = c10::cuda::device_count();
num_gpus = static_cast<int32_t>(c10::cuda::device_count());
cuda_gens_init_flag.resize(num_gpus);
default_gens_cuda.resize(num_gpus);
}

@ -5,7 +5,6 @@
#include <ATen/core/TensorBase.h>
#include <ATen/cuda/PhiloxCudaState.h>
#include <atomic>
#include <limits>
#include <memory>
#include <unordered_set>
namespace at {
@ -168,7 +167,7 @@ struct TORCH_CUDA_CPP_API CUDAGeneratorImpl : public c10::GeneratorImpl {
CUDAGeneratorImpl* clone_impl() const override;

c10::intrusive_ptr<CUDAGeneratorState> state_;
std::atomic_flag no_reset_rnn_state_;
std::atomic_flag no_reset_rnn_state_{};
};

namespace cuda::detail {

@ -7,9 +7,7 @@

#include <chrono>
#include <cstddef>
#include <cstdint>
#include <thread>
#include <vector>

namespace at::cuda {

@ -19,8 +17,7 @@ constexpr int kSynchronizeBusyWaitMillis = 10;
MempoolId_t graph_pool_handle() {
// Sets just the second value, to distinguish it from MempoolId_ts created from
// cudaStreamGetCaptureInfo id_s in capture_begin.
auto new_pool = c10::cuda::MemPool();
return new_pool.id();
return c10::cuda::MemPool::graph_pool_handle();
}

/**
@ -115,8 +112,7 @@ void CUDAGraph::capture_begin(MempoolId_t pool/*=0*/, cudaStreamCaptureMode capt
} else {
// User did not ask us to share a mempool. Create graph pool handle using is_user_created=false.
// Sets just the first value, to distinguish it from MempoolId_ts created by graph_pool_handle().
auto mempool = c10::cuda::MemPool({}, false);
mempool_id_ = mempool.id();
mempool_id_ = c10::cuda::MemPool::graph_pool_handle(false);
TORCH_INTERNAL_ASSERT(mempool_id_.first > 0);
}

@ -124,7 +120,7 @@ void CUDAGraph::capture_begin(MempoolId_t pool/*=0*/, cudaStreamCaptureMode capt
// autograd thread's free() call triggering an invalid cudaEventRecord in the caching allocator
// due to the capture status being updated _after_ a capture had already started.
c10::cuda::CUDACachingAllocator::beginAllocateToPool(capture_dev_, mempool_id_, [this](cudaStream_t stream) {
cudaStreamCaptureStatus status;
cudaStreamCaptureStatus status{};
CaptureId_t stream_capture_id = 0;
AT_CUDA_CHECK(cudaStreamGetCaptureInfo(stream, &status, &stream_capture_id));
return status == cudaStreamCaptureStatus::cudaStreamCaptureStatusActive && stream_capture_id == capture_id_;
@ -144,7 +140,7 @@ void CUDAGraph::capture_begin(MempoolId_t pool/*=0*/, cudaStreamCaptureMode capt
// https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__STREAM.html#group__CUDART__STREAM_1g9d0535d93a214cbf126835257b16ba85
AT_CUDA_CHECK(cudaStreamBeginCapture(capture_stream_, capture_mode));

cudaStreamCaptureStatus status;
cudaStreamCaptureStatus status{};
AT_CUDA_CHECK(cudaStreamGetCaptureInfo(stream, &status, &capture_id_));
TORCH_INTERNAL_ASSERT(status == cudaStreamCaptureStatus::cudaStreamCaptureStatusActive);
@ -39,7 +39,6 @@

#include <sstream>
#include <cstddef>
#include <functional>
#include <memory>

namespace c10::cuda::_internal {
@ -61,7 +60,7 @@ namespace {
bool _hasPrimaryContext(DeviceIndex device_index) {
TORCH_CHECK(device_index >= 0 && device_index < at::cuda::device_count(),
"hasPrimaryContext expects a valid device index, but got device_index=", device_index);
unsigned int ctx_flags;
unsigned int ctx_flags = 0;
// In standalone tests of cuDevicePrimaryCtxGetState, I've seen the "active" argument end up with weird
// (garbage-looking nonzero) values when the context is not active, unless I initialize it to zero.
int ctx_is_active = 0;
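Editor's note: the same zero-initialization pattern recurs throughout this diff (`ctx_flags`, `cudaPointerAttributes attr{}`, `cudaStreamCaptureStatus status{}`, `int runtimeVersion = 0`): CUDA driver/runtime query calls are only guaranteed to fill their out-parameters on success, so the locals are value-initialized first. A hedged sketch of the idea using the driver API (`primary_ctx_active` is an illustrative wrapper, not an ATen function):

#include <cuda.h>

bool primary_ctx_active(CUdevice dev) {
  unsigned int ctx_flags = 0;  // only written by the driver on success
  int ctx_is_active = 0;       // zero-init avoids reading garbage on error paths
  if (cuDevicePrimaryCtxGetState(dev, &ctx_flags, &ctx_is_active) !=
      CUDA_SUCCESS) {
    return false;  // out-params still hold well-defined zeros here
  }
  return ctx_is_active != 0;
}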
@ -103,7 +102,7 @@ void CUDAHooks::init() const {
#endif
}

const Generator& CUDAHooks::getDefaultGenerator(DeviceIndex device_index) const {
const Generator& CUDAHooks::getDefaultCUDAGenerator(DeviceIndex device_index) const {
return at::cuda::detail::getDefaultCUDAGenerator(device_index);
}

@ -124,7 +123,7 @@ bool CUDAHooks::isPinnedPtr(const void* data) const {
if (primary_ctx_device_index.has_value()) {
device_guard.reset_device(at::Device(at::DeviceType::CUDA, *primary_ctx_device_index));
}
cudaPointerAttributes attr;
cudaPointerAttributes attr{};
// We do not believe that CUDA needs mutable access to the data
// here.
cudaError_t err = cudaPointerGetAttributes(&attr, data);
@ -325,10 +324,10 @@ bool CUDAHooks::hasCUDART() const {
std::string CUDAHooks::showConfig() const {
std::ostringstream oss;

int runtimeVersion;
int runtimeVersion = 0;
cudaRuntimeGetVersion(&runtimeVersion);

auto printCudaStyleVersion = [&](int v) {
auto printCudaStyleVersion = [&](size_t v) {
#ifdef USE_ROCM
// HIP_VERSION value format was changed after ROCm v4.2 to include the patch number
if(v < 500) {
@ -369,7 +368,7 @@ std::string CUDAHooks::showConfig() const {
#if AT_CUDNN_ENABLED()


auto printCudnnStyleVersion = [&](int v) {
auto printCudnnStyleVersion = [&](size_t v) {
oss << (v / 1000) << "." << (v / 100 % 10);
if (v % 100 != 0) {
oss << "." << (v % 100);

@ -3,7 +3,6 @@
#include <ATen/detail/CUDAHooksInterface.h>

#include <ATen/Generator.h>
#include <optional>

// TODO: No need to have this whole header, we can just put it all in
// the cpp file
@ -22,8 +21,7 @@ struct CUDAHooks : public at::CUDAHooksInterface {
void init() const override;
Device getDeviceFromPtr(void* data) const override;
bool isPinnedPtr(const void* data) const override;
const Generator& getDefaultGenerator(
DeviceIndex device_index = -1) const override;
const Generator& getDefaultCUDAGenerator(DeviceIndex device_index = -1) const override;
bool hasCUDA() const override;
bool hasMAGMA() const override;
bool hasCuDNN() const override;
@ -37,7 +37,7 @@ within the next one.
bool maybeOverlappingIndices(const TensorBase& t) {
/* Extract size/stride arrays; only consider size >1 dims. */
std::vector<SizeAndStride> info(t.dim());
int dims = t.dim();
auto dims = t.dim();
int nonSize1Dims = 0;
for (int i = 0; i < dims; ++i) {
int64_t size = t.size(i);

@ -8,7 +8,6 @@

#include <iostream>
#include <utility>
#include <chrono>
namespace at {
namespace native {


@ -18,7 +18,7 @@ namespace at::cuda::tunable {
class StreamTimer : public ITimer {
public:
StreamTimer();
virtual ~StreamTimer() override;
~StreamTimer() override;

void Start() override;


@ -19,16 +19,10 @@
#include <cxxabi.h>
#endif

#include <chrono>
#include <fstream>
#include <functional>
#include <limits>
#include <memory>
#include <mutex>
#include <sstream>
#include <string>
#include <thread>
#include <type_traits>
#include <unordered_map>
#include <unordered_set>
#include <utility>
@ -83,7 +77,7 @@ ResultEntry TuningResultsManager::Lookup(const std::string& op_signature, const
return it->second;
}

inline void TuningResultsManager::AddImpl(const std::string& op_signature,
void TuningResultsManager::AddImpl(const std::string& op_signature,
const std::string& params_signature,
ResultEntry best,
KernelMap& kernel_map) {
@ -98,7 +92,7 @@ inline void TuningResultsManager::AddImpl(const std::string& op_signature,
}

TUNABLE_LOG2(op_signature, "(", params_signature, ") -> ", best);
kernel_map.emplace(params_signature, best);
kernel_map.emplace(params_signature, std::move(best));
}

void TuningResultsManager::Add(const std::string& op_signature, const std::string& params_signature, ResultEntry best) {
@ -109,7 +103,7 @@ void TuningResultsManager::Add(const std::string& op_signature, const std::strin
it = results_.insert({op_signature, {}}).first;
}

AddImpl(op_signature, params_signature, best, it->second);
AddImpl(op_signature, params_signature, std::move(best), it->second);
}

void TuningResultsManager::RecordUntuned( std::ofstream& untuned_file, const std::string& op_signature, const std::string& params_signature) {
@ -155,7 +149,7 @@ void TuningResultsManager::Delete(const std::string& op_signature, const std::st
it->second.erase(it2);
}

inline void TuningResultsManager::DisjointMergeImpl(
void TuningResultsManager::DisjointMergeImpl(
const std::string& op_signature,
const KernelMap& kernel_map,
/*out*/ std::unordered_map<std::string, KernelMap>& results) {
@ -205,7 +199,7 @@ size_t TuningResultsManager::GetSize() {
TuningResultsValidator::TuningResultsValidator() {
RegisterValidator(
"PT_VERSION",
[this]() { return GetPyTorchVersion(); },
[]() { return GetPyTorchVersion(); },
[this](auto&& k) { return ValidatePyTorchVersion(std::forward<decltype(k)>(k)); });
#ifdef USE_ROCM
// rocm
@ -368,7 +362,7 @@ void TuningResultsValidator::RegisterValidator(const std::string& key, const Get
}
}

std::string TuningResultsValidator::GetPyTorchVersion() const {
std::string TuningResultsValidator::GetPyTorchVersion() {
return TORCH_VERSION;
}

@ -487,7 +481,7 @@ std::ofstream& TuningContext::GetUntunedFile(){
std::string filename = (env == nullptr) ? "tunableop_untuned.csv" : env;

std::string device = c10::str(int(c10::cuda::current_device()));
std::size_t found = filename.rfind(".");
std::size_t found = filename.rfind('.');
if (found != std::string::npos) {
filename.insert(found, device);
} else {
@ -10,6 +10,7 @@
#pragma once

#include <c10/util/CallOnce.h>
#include <c10/util/StringUtil.h>

#include <fstream>
#include <functional>
@ -17,11 +18,9 @@
#include <memory>
#include <mutex>
#include <string>
#include <type_traits>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>

namespace at::cuda::tunable {

@ -34,11 +33,11 @@ struct MaybeDelete {

using OstreamPtr = std::unique_ptr<std::ostream, MaybeDelete>;

static OstreamPtr get_stream(std::string filename) {
if (filename.compare("out") == 0) {
inline OstreamPtr get_stream(const std::string& filename) {
if (filename == "out") {
return OstreamPtr { &std::cout, MaybeDelete {false} };
}
else if (filename.compare("err") == 0) {
else if (filename == "err") {
return OstreamPtr { &std::cerr, MaybeDelete {false} };
}
else {
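Editor's note: `OstreamPtr` pairs `std::unique_ptr` with a conditional deleter so that "out" and "err" can alias the global streams without ever being freed, while real filenames get an owned `std::ofstream` (the hunk above is truncated before that branch). A self-contained sketch of the pattern; the exact shape of `struct MaybeDelete` is assumed from the hunk header:

#include <fstream>
#include <iostream>
#include <memory>
#include <string>

struct MaybeDelete {
  bool owns;
  // std::ostream has a virtual destructor via ios_base, so deleting an
  // ofstream through the base pointer is well-defined.
  void operator()(std::ostream* os) const { if (owns) delete os; }
};
using OstreamPtr = std::unique_ptr<std::ostream, MaybeDelete>;

OstreamPtr get_stream(const std::string& filename) {
  if (filename == "out") return OstreamPtr{&std::cout, MaybeDelete{false}};
  if (filename == "err") return OstreamPtr{&std::cerr, MaybeDelete{false}};
  // Owning case: a heap-allocated file stream that the deleter will free.
  return OstreamPtr{new std::ofstream(filename), MaybeDelete{true}};
}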
@ -72,7 +71,7 @@ enum TORCH_CUDA_CPP_API TuningStatus {
// Mapping from params signature to kernel id
class TORCH_CUDA_CPP_API ResultEntry {
public:
explicit ResultEntry(const std::string& key, double time) : key_(key), time_(time) {}
explicit ResultEntry(std::string key, double time) : key_(std::move(key)), time_(time) {}
bool operator==(const ResultEntry& other) { return key_ == other.key_; }
bool operator!=(const ResultEntry& other) { return key_ != other.key_; }
operator std::string () { return key_; }
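Editor's note: the constructor change above is the pass-by-value-and-move ("sink argument") idiom. Taking `std::string` by value lets callers move an rvalue in with no copy, while lvalue callers pay only the one copy they already needed. A minimal sketch (the `Entry` class and key string are illustrative):

#include <string>
#include <utility>

class Entry {
 public:
  // key is a sink: copied once from lvalues, moved (no copy) from rvalues.
  explicit Entry(std::string key, double time)
      : key_(std::move(key)), time_(time) {}

 private:
  std::string key_;
  double time_;
};

int main() {
  std::string k = "gemm_64x64_nn";
  Entry a(k, 0.12);             // one copy into the parameter, then a move
  Entry b(std::move(k), 0.12);  // moved all the way through, zero copies
}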
@ -108,7 +107,7 @@ class TORCH_CUDA_CPP_API TuningResultsManager {

ResultEntry Lookup(const std::string& op_signature, const std::string& params_signature);

inline void AddImpl(const std::string& op_signature,
void AddImpl(const std::string& op_signature,
const std::string& params_signature,
ResultEntry best,
KernelMap& kernel_map);
@ -119,7 +118,7 @@ class TORCH_CUDA_CPP_API TuningResultsManager {

void Delete(const std::string& op_signature, const std::string& params_signature);

inline void DisjointMergeImpl(
void DisjointMergeImpl(
const std::string& op_signature,
const KernelMap& kernel_map,
/*out*/ ResultsMap& results);
@ -154,7 +153,7 @@ class TORCH_CUDA_CPP_API TuningResultsValidator {
void RegisterValidator(const std::string& key, const GetFunc& gf, const ValidateFunc& vf);

protected:
std::string GetPyTorchVersion() const;
static std::string GetPyTorchVersion() ;
TuningStatus ValidatePyTorchVersion(const std::string& value) const;

public:
@ -18,7 +18,6 @@
#endif

#include <string>
#include <type_traits>
#include <unordered_map>
#include <vector>

@ -146,7 +145,7 @@ class TunableOp {
bool use_buffer_rotation = (rotating_size > 0);
size_t param_size = params->GetSize(use_buffer_rotation);
size_t param_count = (rotating_size / param_size) + 1;
constexpr size_t MB = 1024*1024;
constexpr size_t MB = 1024ull*1024;
if (use_buffer_rotation) {
TUNABLE_LOG2("Rotating buffer ", rotating_size/MB, " MiB. ",
"Needed Size: ", param_size/MB, " MiB. ",
@ -266,6 +265,7 @@ class TunableOp {
std::string CreateSignature() {
#ifndef _WIN32
const auto* name = typeid(*this).name();
// NOLINTNEXTLINE(*array*)
char buf[256];
size_t buf_len = 256;
abi::__cxa_demangle(name, buf, &buf_len, nullptr);

@ -1,13 +1,9 @@
#pragma once

#include <ATen/core/Generator.h>

#include <c10/core/Allocator.h>
#include <c10/core/Device.h>
#include <c10/core/Stream.h>

#include <c10/core/Allocator.h>
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wunused-parameter")

namespace at {

// AcceleratorHooksInterface is a shared interface provided by all
@ -62,18 +58,7 @@ struct TORCH_API AcceleratorHooksInterface {
virtual Device getDeviceFromPtr(void* data) const {
TORCH_CHECK(false, "Backend doesn't support getDeviceFromPtr()");
}

virtual const Generator& getDefaultGenerator(
C10_UNUSED DeviceIndex device_index = -1) const {
TORCH_CHECK(false, "Backend doesn`t support getDefaultGenerator()");
}

virtual Generator getNewGenerator(
C10_UNUSED DeviceIndex device_index = -1) const {
TORCH_CHECK(false, "Backend doesn`t support getNewGenerator()");
}
};

} // namespace at

C10_DIAGNOSTIC_POP()
@ -6,13 +6,16 @@

#include <ATen/detail/AcceleratorHooksInterface.h>

// NB: Class must live in `at` due to limitations of Registry.h.
// Forward-declares at::Generator and at::cuda::NVRTC
namespace at {

// Forward-declares at::cuda::NVRTC
struct Generator;
namespace cuda {
struct NVRTC;
} // namespace cuda
} // namespace at

// NB: Class must live in `at` due to limitations of Registry.h.
namespace at {

#ifdef _MSC_VER
constexpr const char* CUDA_HELP =
@ -66,8 +69,8 @@ struct TORCH_API CUDAHooksInterface : AcceleratorHooksInterface {
TORCH_CHECK(false, "Cannot initialize CUDA without ATen_cuda library. ", CUDA_HELP);
}

const Generator& getDefaultGenerator(
[[maybe_unused]] DeviceIndex device_index = -1) const override {
virtual const Generator& getDefaultCUDAGenerator(
[[maybe_unused]] DeviceIndex device_index = -1) const {
TORCH_CHECK(
false,
"Cannot get default CUDA generator without ATen_cuda library. ",

@ -1,13 +1,19 @@
#pragma once

#include <c10/core/Allocator.h>
#include <c10/core/GeneratorImpl.h>
#include <c10/util/Exception.h>

#include <c10/util/Registry.h>

#include <ATen/detail/AcceleratorHooksInterface.h>

#include <memory>

namespace at {
class Context;
}

// NB: Class must live in `at` due to limitations of Registry.h.
namespace at {

@ -24,9 +30,8 @@ struct TORCH_API HIPHooksInterface : AcceleratorHooksInterface {
TORCH_CHECK(false, "Cannot initialize HIP without ATen_hip library.");
}

const Generator& getDefaultGenerator(
C10_UNUSED DeviceIndex device_index = -1) const override {
TORCH_CHECK(false, "Cannot initialize HIP without ATen_hip library.");
virtual std::unique_ptr<c10::GeneratorImpl> initHIPGenerator(Context*) const {
AT_ERROR("Cannot initialize HIP generator without ATen_hip library.");
}

virtual bool hasHIP() const {
@ -45,6 +50,10 @@ struct TORCH_API HIPHooksInterface : AcceleratorHooksInterface {
TORCH_CHECK(false, "Pinned memory requires HIP.");
}

virtual void registerHIPTypes(Context*) const {
AT_ERROR("Cannot registerHIPTypes() without ATen_hip library.");
}

virtual int getNumGPUs() const {
return 0;
}
@ -1,5 +1,6 @@
#pragma once

#include <ATen/core/Generator.h>
#include <ATen/detail/AcceleratorHooksInterface.h>

#include <c10/core/Allocator.h>
@ -8,7 +9,7 @@

namespace at {

struct TORCH_API IPUHooksInterface : AcceleratorHooksInterface {
struct TORCH_API IPUHooksInterface: AcceleratorHooksInterface {
~IPUHooksInterface() override = default;

void init() const override {
@ -20,14 +21,16 @@ struct TORCH_API IPUHooksInterface : AcceleratorHooksInterface {
return false;
}

const Generator& getDefaultGenerator(
C10_UNUSED DeviceIndex device_index = -1) const override {
TORCH_CHECK(false, "Cannot initialize IPU without ATen_ipu library.");
virtual const Generator& getDefaultIPUGenerator(
DeviceIndex device_index [[maybe_unused]] = -1) const {
AT_ERROR(
"Cannot get the default IPU generator: the IPU backend is not "
"available.");
}

Generator getNewGenerator(
DeviceIndex device_index [[maybe_unused]] = -1) const override {
TORCH_CHECK(false, "Cannot initialize IPU without ATen_ipu library.");
virtual Generator newIPUGenerator(DeviceIndex device_index [[maybe_unused]] = -1) const {
AT_ERROR(
"Cannot create a new IPU generator: the IPU backend is not available.");
}
};


@ -2,9 +2,9 @@

#pragma once

#include <ATen/detail/AcceleratorHooksInterface.h>

#include <c10/core/Allocator.h>
#include <ATen/core/Generator.h>
#include <ATen/detail/AcceleratorHooksInterface.h>
#include <c10/util/Exception.h>
#include <c10/util/Registry.h>

@ -31,8 +31,7 @@ struct TORCH_API MPSHooksInterface : AcceleratorHooksInterface {
virtual bool isOnMacOSorNewer(unsigned major = 13, unsigned minor = 0) const {
FAIL_MPSHOOKS_FUNC(__func__);
}
const Generator& getDefaultGenerator(
C10_UNUSED DeviceIndex device_index = -1) const override {
virtual const Generator& getDefaultMPSGenerator() const {
FAIL_MPSHOOKS_FUNC(__func__);
}
virtual Allocator* getMPSDeviceAllocator() const {
@ -1,20 +1,18 @@
#pragma once

#include <ATen/core/Generator.h>
#include <ATen/detail/AcceleratorHooksInterface.h>
#include <c10/core/Allocator.h>
#include <c10/core/Device.h>
#include <c10/core/Storage.h>
#include <c10/util/Exception.h>

C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wunused-parameter")

namespace at {

struct TORCH_API PrivateUse1HooksInterface : AcceleratorHooksInterface {
~PrivateUse1HooksInterface() override = default;

const at::Generator& getDefaultGenerator(
c10::DeviceIndex device_index) const override {
virtual const at::Generator& getDefaultGenerator(
c10::DeviceIndex device_index) const {
TORCH_CHECK_NOT_IMPLEMENTED(
false,
"You should register `PrivateUse1HooksInterface` for PrivateUse1 before call `getDefaultGenerator`.");
@ -26,17 +24,17 @@ struct TORCH_API PrivateUse1HooksInterface : AcceleratorHooksInterface {
"You should register `PrivateUse1HooksInterface` for PrivateUse1 before call `getDeviceFromPtr`.");
}

bool isPinnedPtr(const void* data) const override {
virtual bool isPinnedPtr(const void* data) const override {
return false;
}

Allocator* getPinnedMemoryAllocator() const override {
virtual Allocator* getPinnedMemoryAllocator() const override {
TORCH_CHECK(
false,
"You should register `PrivateUse1HooksInterface` for PrivateUse1 before call `getPinnedMemoryAllocator`.");
}

bool hasPrimaryContext(DeviceIndex device_index) const override {
virtual bool hasPrimaryContext(DeviceIndex device_index) const override {
TORCH_CHECK_NOT_IMPLEMENTED(
false,
"You should register `PrivateUse1HooksInterface` for PrivateUse1 before call `hasPrimaryContext`.");

@ -4,6 +4,7 @@
#include <c10/util/Exception.h>
#include <c10/util/Registry.h>

#include <ATen/core/Generator.h>
#include <ATen/detail/AcceleratorHooksInterface.h>

C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wunused-parameter")
@ -31,15 +32,15 @@ struct TORCH_API XPUHooksInterface : AcceleratorHooksInterface{
TORCH_CHECK(false, "Cannot get XPU global device index without ATen_xpu library.");
}

const Generator& getDefaultGenerator(
[[maybe_unused]] DeviceIndex device_index = -1) const override {
TORCH_CHECK(
false, "Cannot get default XPU generator without ATen_xpu library.");
virtual Generator getXPUGenerator(
[[maybe_unused]] DeviceIndex device_index = -1) const {
TORCH_CHECK(false, "Cannot get XPU generator without ATen_xpu library.");
}

Generator getNewGenerator(
[[maybe_unused]] DeviceIndex device_index = -1) const override {
TORCH_CHECK(false, "Cannot get XPU generator without ATen_xpu library.");
virtual const Generator& getDefaultXPUGenerator(
[[maybe_unused]] DeviceIndex device_index = -1) const {
TORCH_CHECK(
false, "Cannot get default XPU generator without ATen_xpu library.");
}

virtual DeviceIndex getNumGPUs() const {
@ -19,8 +19,7 @@ struct MPSHooks : public at::MPSHooksInterface {
bool isOnMacOSorNewer(unsigned major, unsigned minor) const override;

// MPSGeneratorImpl interface
const Generator& getDefaultGenerator(
DeviceIndex device_index = -1) const override;
const Generator& getDefaultMPSGenerator() const override;

// MPSStream interface
void deviceSynchronize() const override;

@ -59,7 +59,7 @@ Allocator* MPSHooks::getMPSDeviceAllocator() const {
return at::mps::GetMPSAllocator();
}

const Generator& MPSHooks::getDefaultGenerator([[maybe_unused]] DeviceIndex device_index) const {
const Generator& MPSHooks::getDefaultMPSGenerator() const {
return at::mps::detail::getDefaultMPSGenerator();
}
@ -26,6 +26,7 @@
#include <ATen/native/cpu/SerialStackImpl.h>
#include <ATen/native/cpu/StackKernel.h>
#include <ATen/quantized/QTensorImpl.h>
#include <c10/core/GradMode.h>
#include <c10/util/Exception.h>
#include <optional>
#include <c10/util/SmallVector.h>
@ -4071,29 +4072,41 @@ void split_copy_Tensor_out(const at::Tensor & self, int64_t split_size, int64_t
}
}

void split_with_sizes_copy_out(const at::Tensor & self, at::IntArrayRef split_sizes, int64_t dim, at::TensorList out) {
auto tmp = self.split_with_sizes(split_sizes, dim);
namespace {

TORCH_CHECK(out.size() == tmp.size(), "split_with_sizes_copy_out() expected an out= argument of size ", tmp.size(), ", got size ", out.size());
void copy_tensor_array_to_out(const char* name, const std::vector<Tensor>& array, at::TensorList out) {
TORCH_CHECK(out.size() == array.size(), name, " expected an out= argument of size ", array.size(), ", got size ", out.size());
for (const auto i : c10::irange(out.size())) {
if (resize_output_check(out[i], tmp[i].sizes())) {
out[i].resize_(tmp[i].sizes());
if (resize_output_check(out[i], array[i].sizes())) {
out[i].resize_(array[i].sizes());
}
TORCH_CHECK(out[i].dtype() == tmp[i].dtype(),
"Expected out tensor to have dtype ", tmp[i].dtype(), ", but got ", out[i].dtype(), " instead");
TORCH_CHECK(out[i].device() == tmp[i].device(),
"Expected out tensor to have device ", tmp[i].device(), ", but got ", out[i].device(), " instead");
out[i].copy_(tmp[i]);
TORCH_CHECK(out[i].dtype() == array[i].dtype(),
"Expected out tensor to have dtype ", array[i].dtype(), ", but got ", out[i].dtype(), " instead");
TORCH_CHECK(out[i].device() == array[i].device(),
"Expected out tensor to have device ", array[i].device(), ", but got ", out[i].device(), " instead");
out[i].copy_(array[i]);
}
}

void unbind_copy_int_out(const at::Tensor & self, int64_t dim, at::TensorList out) {
auto tmp = self.unbind(dim);
}

TORCH_CHECK(out.size() == tmp.size(), "unbind_copy_int_out() expected an out= argument of size ", tmp.size(), ", got size ", out.size());
for (const auto i : c10::irange(out.size())) {
out[i].copy_(tmp[i]);
void split_with_sizes_copy_out(const at::Tensor & self, at::IntArrayRef split_sizes, int64_t dim, at::TensorList out) {
auto tmp = self.split_with_sizes(split_sizes, dim);
copy_tensor_array_to_out("split_with_sizes_copy_out()", tmp, out);
}

void unbind_copy_int_out(const at::Tensor & self, int64_t dim, at::TensorList out) {
if (at::GradMode::is_enabled()) {
for (const auto i : c10::irange(out.size())) {
TORCH_CHECK(!out[i].requires_grad(),
"unbind_copy(): functions with out=... arguments don't support automatic differentiation, "
"but one of the arguments requires grad."
);
}
}

auto tmp = self.unbind(dim);
copy_tensor_array_to_out("unbind_copy_int_out()", tmp, out);
}

int64_t sparse_dim_default(const Tensor& self) {

@ -36,6 +36,7 @@
#include <ATen/native/TensorIteratorDynamicCasting.h>
#include <ATen/cpu/vec/vec.h>

#include <tuple>
#include <utility>

namespace at::native { inline namespace CPU_CAPABILITY {

@ -6,10 +6,9 @@
#include <c10/core/Scalar.h>
#include <c10/util/irange.h>

#include <sstream>
#include <type_traits>

namespace at { namespace native { inline namespace CPU_CAPABILITY {
namespace at::native { inline namespace CPU_CAPABILITY {

using namespace vec;

@ -308,4 +307,4 @@ void binary_kernel_reduce_lastdim(TensorIteratorBase& iter, reduce_func_t reduce
sub_iter.for_each(loop, grain_size);
}

}}} // namespace at::native::<anonymous>
}} // namespace at::native::<anonymous>
@ -124,6 +124,55 @@ __global__ void indexing_backward_kernel(
}
}

#ifdef USE_ROCM
template <typename scalar_t, bool accumulate>
__global__ void indexing_backward_kernel_rocm(
const int64_t* sorted_indices, const int64_t* indices, const scalar_t* grad_output, scalar_t* grad_weight,
int64_t numel, int64_t stride, int64_t stride_before, int64_t outer_dim) {

// This implementation is adopted from indexing_backward_kernel above.
using opmath_t = at::opmath_type<scalar_t>;
for (int64_t z = blockIdx.z; z < outer_dim; z += gridDim.z){
int64_t idx = blockIdx.x * blockDim.y + threadIdx.y;
if (idx < numel && (idx == 0 || sorted_indices[idx] != sorted_indices[idx - 1])){
do {
// if not accumulate, we only keep the last duplicate index so skip those before it
if constexpr (!accumulate) {
if ((idx < numel - 1) && sorted_indices[idx] == sorted_indices[idx + 1]) {
idx++;
continue;
}
}
const int64_t weight_row = ((int64_t) sorted_indices[idx]) * stride + z * stride_before;
const int64_t grad_row = ((int64_t) indices[idx]) * stride + z * numel * stride;

opmath_t gradient;
opmath_t weight;

int64_t feature_dim = threadIdx.x + blockIdx.y * blockDim.x;
while (feature_dim < stride) {
gradient = static_cast<opmath_t>(grad_output[grad_row + feature_dim]);
if constexpr (accumulate) {
weight = static_cast<opmath_t>(grad_weight[weight_row + feature_dim]);
}

if constexpr (accumulate) {
weight += gradient;
} else {
weight = gradient;
}

grad_weight[weight_row + feature_dim] = static_cast<scalar_t>(weight);
feature_dim += gridDim.y * blockDim.x;
}

idx++;
} while (idx < numel && sorted_indices[idx] == sorted_indices[idx - 1]);
}
}
}
#endif
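Editor's note: the `accumulate` flag in the kernel above selects between two scatter semantics over the pre-sorted index list: sum every duplicate into the destination row, or let only the last duplicate win (the `index_put_` behavior with `accumulate=False`). A sequential CPU sketch of that rule, assuming for simplicity that `grad` has already been permuted into sorted-index order (the kernel instead maps back through `indices`):

#include <cstdint>
#include <vector>

void scatter_rows(const std::vector<int64_t>& sorted_indices,
                  const std::vector<float>& grad,
                  std::vector<float>& weight,
                  bool accumulate) {
  const int64_t numel = static_cast<int64_t>(sorted_indices.size());
  for (int64_t j = 0; j < numel; ++j) {
    // Non-accumulating writes keep only the last duplicate of each index.
    if (!accumulate && j + 1 < numel &&
        sorted_indices[j] == sorted_indices[j + 1]) {
      continue;
    }
    const int64_t row = sorted_indices[j];
    weight[row] = accumulate ? weight[row] + grad[j] : grad[j];
  }
}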
template <typename scalar_t>
__global__ void indexing_backward_kernel_stride_1(
const int64_t* sorted_indices, const int64_t* indices, const scalar_t* grad_output, scalar_t* grad_weight,
@ -491,7 +540,11 @@ void index_put_with_sort_kernel(Tensor & self, const c10::List<std::optional<Ten
linearIndex.numel()*sliceSize*nElemBefore == expandedValue.numel(),
"number of flattened indices did not match number of elements in the value tensor: ",
linearIndex.numel()*sliceSize*nElemBefore, " vs ", expandedValue.numel());
#ifdef USE_ROCM
const int UNROLL = 1;
#else
const int UNROLL = 4;
#endif
const int indices_per_block = 4;
const int warp_size = at::cuda::warp_size();
dim3 grid(ceil_div(num_indices, (int64_t) indices_per_block),
@ -549,6 +602,54 @@ void index_put_with_sort_kernel(Tensor & self, const c10::List<std::optional<Ten
kHalf,
kBool,
kBFloat16);
#ifdef USE_ROCM
} else if (UNROLL == 1) {
if (accumulate) {
AT_DISPATCH_V2(
expandedValue.scalar_type(),
"indexing_backward",
AT_WRAP([&] {
indexing_backward_kernel_rocm<scalar_t, true><<<grid, block, 0, stream>>>(
sorted_indices.const_data_ptr<int64_t>(),
orig_indices.const_data_ptr<int64_t>(),
expandedValue.const_data_ptr<scalar_t>(),
src_.mutable_data_ptr<scalar_t>(),
num_indices,
sliceSize,
strideBefore,
nElemBefore);
C10_CUDA_KERNEL_LAUNCH_CHECK();
}),
AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX),
AT_EXPAND(AT_FLOAT8_TYPES),
kComplexHalf,
kHalf,
kBool,
kBFloat16);
} else {
AT_DISPATCH_V2(
expandedValue.scalar_type(),
"indexing_backward",
AT_WRAP([&] {
indexing_backward_kernel_rocm<scalar_t, false><<<grid, block, 0, stream>>>(
sorted_indices.const_data_ptr<int64_t>(),
orig_indices.const_data_ptr<int64_t>(),
expandedValue.const_data_ptr<scalar_t>(),
src_.mutable_data_ptr<scalar_t>(),
num_indices,
sliceSize,
strideBefore,
nElemBefore);
C10_CUDA_KERNEL_LAUNCH_CHECK();
}),
AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX),
AT_EXPAND(AT_FLOAT8_TYPES),
kComplexHalf,
kHalf,
kBool,
kBFloat16);
}
#endif
} else {
AT_DISPATCH_V2(
expandedValue.scalar_type(),
@ -572,8 +673,8 @@ void index_put_with_sort_kernel(Tensor & self, const c10::List<std::optional<Ten
kHalf,
kBool,
kBFloat16);
}
}
}

if (permuted) {
self.copy_(src_.permute(inversePerm));

@ -11,6 +11,7 @@

#include <ATen/native/cuda/MemoryAccess.cuh>

#include <tuple>

namespace at::native {
@ -2363,7 +2363,8 @@ DropoutState& get_dropout_state(
std::unique_lock<std::mutex> lock{state_cache_mut};
auto& state = dropout_state_cache.at(device);
if (train && dropout_p > 0) {
const auto& gen = at::detail::getCUDAHooks().getDefaultGenerator(device);
const auto& gen =
at::detail::getCUDAHooks().getDefaultCUDAGenerator(device);
auto gen_impl = gen.get<at::CUDAGeneratorImpl>();
bool reset_rnn_state = gen_impl->reset_rnn_state();
if (!state.buffer.defined() || reset_rnn_state) {

@ -3357,7 +3357,7 @@
dispatch:
CUDA: _cslt_compress

- func: _cslt_sparse_mm(Tensor compressed_A, Tensor dense_B, Tensor? bias=None, Tensor? alpha=None, ScalarType? out_dtype=None, bool transpose_result=False, int alg_id=0) -> Tensor
- func: _cslt_sparse_mm(Tensor compressed_A, Tensor dense_B, Tensor? bias=None, Tensor? alpha=None, ScalarType? out_dtype=None, bool transpose_result=False, int alg_id=0, int split_k=1, bool split_k_one_kernel=True) -> Tensor
dispatch:
CUDA: _cslt_sparse_mm

@ -22,6 +22,7 @@
#include <ATen/ops/tensor.h>
#endif

#include <tuple>
#include <utility>
#include <vector>
@ -1,20 +1,7 @@
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/CUDADataType.h>
#include <ATen/cuda/CUDASparse.h>
#include <ATen/cuda/CUDAConfig.h>
#include <ATen/core/Tensor.h>
#include <ATen/Dispatch.h>
#include <ATen/Functions.h>
#include <c10/core/ScalarType.h>
#include <c10/cuda/CUDACachingAllocator.h>
#include <c10/util/Half.h>
#include <cusparse.h>
#include <cstdint>
#include <ATen/native/sparse/cuda/cuSPARSELtOps.h>

#if AT_CUSPARSELT_ENABLED()

#include <cusparseLt.h>

namespace at::native {

// Ideally we would use the same DeviceThreadHandlePool mechanism as used in aten/src/ATen/cuda/CuSparseHandlePool.cpp
@ -56,6 +43,7 @@ at::Tensor _cslt_compress(const Tensor& sparse_input)
#if defined(CUSPARSELT_VERSION) && CUSPARSELT_VERSION >= 602
case at::ScalarType::Float8_e4m3fn:
type = CUDA_R_8F_E4M3;
compression_factor = 10;
break;
#endif
default:
@ -103,7 +91,7 @@ at::Tensor _cslt_compress(const Tensor& sparse_input)
return compressed_tensor;
}

std::tuple<int64_t, at::Tensor> _cslt_sparse_mm_impl(
std::tuple<at::Tensor, int64_t, int64_t, bool, int64_t> _cslt_sparse_mm_impl(
const Tensor& compressed_A,
const Tensor& dense_B,
const std::optional<Tensor>& bias_opt,
@ -111,6 +99,8 @@ std::tuple<int64_t, at::Tensor> _cslt_sparse_mm_impl(
const std::optional<c10::ScalarType> out_dtype_opt,
bool transpose_result,
int alg_id,
int split_k,
bool split_k_one_kernel,
bool search_alg_id
)
{
@ -169,6 +159,7 @@ std::tuple<int64_t, at::Tensor> _cslt_sparse_mm_impl(
output_type = CUDA_R_8F_E4M3;
C_type = CUDA_R_16F;
compute_type = CUSPARSE_COMPUTE_32F;
compression_factor = 10;
break;
#endif
// cuSPARSELt <= v0.5.2 uses CUSPARSE_COMPUTE_TF32, CUSPARSE_COMPUTE_16F
@ -335,10 +326,21 @@ std::tuple<int64_t, at::Tensor> _cslt_sparse_mm_impl(
TORCH_CUDASPARSE_CHECK(cusparseLtMatmulAlgSelectionInit(
&handle, &alg_sel, &matmul, CUSPARSELT_MATMUL_ALG_DEFAULT));

// set alg_id
// set matmul search params
TORCH_CUDASPARSE_CHECK(cusparseLtMatmulAlgSetAttribute(
&handle, &alg_sel, CUSPARSELT_MATMUL_ALG_CONFIG_ID, &alg_id, sizeof(alg_id)));

cusparseLtSplitKMode_t splitKMode;
int max_alg_id;
if (split_k != 1) {
TORCH_CUDASPARSE_CHECK(cusparseLtMatmulAlgSetAttribute(
&handle, &alg_sel, CUSPARSELT_MATMUL_SPLIT_K, &split_k, sizeof(split_k)));

splitKMode = split_k_one_kernel ? CUSPARSELT_SPLIT_K_MODE_ONE_KERNEL : CUSPARSELT_SPLIT_K_MODE_TWO_KERNELS;
TORCH_CUDASPARSE_CHECK(cusparseLtMatmulAlgSetAttribute(
&handle, &alg_sel, CUSPARSELT_MATMUL_SPLIT_K_MODE, &splitKMode, sizeof(splitKMode)));
}

// set tensor_alpha_mode and alpha pointer for matmul
const auto alpha_tensor = alpha_opt.has_value() ? *alpha_opt: Tensor{};
auto alpha_ptr = &alpha;
@ -381,9 +383,23 @@ std::tuple<int64_t, at::Tensor> _cslt_sparse_mm_impl(
|
||||
&stream,
|
||||
1));
|
||||
|
||||
// get alg_id used
|
||||
// get matmul params used
|
||||
TORCH_CUDASPARSE_CHECK(cusparseLtMatmulAlgGetAttribute(
|
||||
&handle, &alg_sel, CUSPARSELT_MATMUL_ALG_CONFIG_ID, &alg_id, sizeof(alg_id)));
|
||||
|
||||
TORCH_CUDASPARSE_CHECK( cusparseLtMatmulAlgGetAttribute(&handle, &alg_sel,
|
||||
CUSPARSELT_MATMUL_SPLIT_K,
|
||||
&split_k, sizeof(split_k)));
|
||||
|
||||
TORCH_CUDASPARSE_CHECK( cusparseLtMatmulAlgGetAttribute(&handle, &alg_sel,
|
||||
CUSPARSELT_MATMUL_SPLIT_K_MODE,
|
||||
&splitKMode, sizeof(splitKMode)));
|
||||
|
||||
TORCH_CUDASPARSE_CHECK( cusparseLtMatmulAlgGetAttribute(&handle, &alg_sel,
|
||||
CUSPARSELT_MATMUL_ALG_CONFIG_MAX_ID,
|
||||
&max_alg_id, sizeof(max_alg_id)));
|
||||
|
||||
|
||||
}
|
||||
else {
|
||||
// do normal matmul
|
||||
@ -411,7 +427,7 @@ std::tuple<int64_t, at::Tensor> _cslt_sparse_mm_impl(
|
||||
// destroy plan
|
||||
TORCH_CUDASPARSE_CHECK(cusparseLtMatmulPlanDestroy(&plan));
|
||||
|
||||
return {alg_id, res};
|
||||
return {res, alg_id, split_k, splitKMode == CUSPARSELT_SPLIT_K_MODE_ONE_KERNEL, max_alg_id};
|
||||
}
|
||||
|
||||
at::Tensor _cslt_sparse_mm(
|
||||
@ -421,7 +437,9 @@ at::Tensor _cslt_sparse_mm(
|
||||
const std::optional<Tensor>& alpha_opt,
|
||||
const std::optional<c10::ScalarType> out_dtype_opt,
|
||||
bool transpose_result,
|
||||
int64_t alg_id
|
||||
int64_t alg_id,
|
||||
int64_t split_k,
|
||||
bool split_k_one_kernel
|
||||
)
|
||||
{
|
||||
auto result = _cslt_sparse_mm_impl(
|
||||
@ -432,8 +450,10 @@ at::Tensor _cslt_sparse_mm(
|
||||
out_dtype_opt,
|
||||
transpose_result,
|
||||
(int) alg_id,
|
||||
(int) split_k,
|
||||
split_k_one_kernel,
|
||||
false);
|
||||
return std::get<1>(result);
|
||||
return std::get<0>(result);
|
||||
}
|
||||
|
||||
int64_t _cslt_sparse_mm_search(
|
||||
@ -445,7 +465,10 @@ int64_t _cslt_sparse_mm_search(
|
||||
bool transpose_result
|
||||
)
|
||||
{
|
||||
TORCH_WARN_ONCE("torch._cslt_sparse_mm_search is deprecated and will be removed in a future PyTorch release. Please use torch._C._cusparselt.mm_search instead.");
|
||||
int alg_id_int = 0;
|
||||
int split_k = 1;
|
||||
bool split_k_one_kernel= true;
|
||||
auto result = _cslt_sparse_mm_impl(
|
||||
compressed_A,
|
||||
dense_B,
|
||||
@ -454,11 +477,12 @@ int64_t _cslt_sparse_mm_search(
|
||||
out_dtype_opt,
|
||||
transpose_result,
|
||||
alg_id_int,
|
||||
split_k,
|
||||
split_k_one_kernel,
|
||||
true);
|
||||
return (int64_t) std::get<0>(result);
|
||||
return (int64_t) std::get<1>(result);
|
||||
}
|
||||
|
||||
|
||||
} // namespace at::native
|
||||
|
||||
#else // No cuSPARSELt support, throw error if these functions are called.
|
||||
@ -476,7 +500,9 @@ at::Tensor _cslt_sparse_mm(
|
||||
const std::optional<Tensor>& alpha_opt,
|
||||
const std::optional<c10::ScalarType> out_dtype,
|
||||
bool transpose_result,
|
||||
int64_t alg_id)
|
||||
int64_t alg_id,
|
||||
int64_t split_k,
|
||||
bool split_k_one_kernel)
|
||||
{
|
||||
TORCH_CHECK(false, "cuSPARSELt not supported on your machine.");
|
||||
}
|
||||
|
||||
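Per the TORCH_WARN_ONCE added above, _cslt_sparse_mm_search still runs the cuSPARSELt algorithm search but is deprecated. A hedged sketch of the flow it keeps working (reusing a_compressed and b from the earlier sketch):

    # Returns the best alg_id found by the search; deprecated in favor of
    # torch._C._cusparselt.mm_search, per the warning text above.
    best_alg_id = torch._cslt_sparse_mm_search(a_compressed, b)
    out = torch._cslt_sparse_mm(a_compressed, b, alg_id=best_alg_id)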
aten/src/ATen/native/sparse/cuda/cuSPARSELtOps.h (new file, 58 lines)
@@ -0,0 +1,58 @@
#pragma once

#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/CUDADataType.h>
#include <ATen/cuda/CUDASparse.h>
#include <ATen/cuda/CUDAConfig.h>
#include <ATen/core/Tensor.h>
#include <ATen/Dispatch.h>
#include <ATen/Functions.h>
#include <c10/core/ScalarType.h>
#include <c10/cuda/CUDACachingAllocator.h>
#include <c10/util/Half.h>
#include <cusparse.h>
#include <cstdint>

#if AT_CUSPARSELT_ENABLED()
#include <cusparseLt.h>
#endif

namespace at::native {

at::Tensor _cslt_compress(const Tensor& sparse_input);

TORCH_CUDA_CPP_API std::tuple<at::Tensor, int64_t, int64_t, bool, int64_t> _cslt_sparse_mm_impl(
    const Tensor& compressed_A,
    const Tensor& dense_B,
    const std::optional<Tensor>& bias_opt,
    const std::optional<Tensor>& alpha_opt,
    const std::optional<c10::ScalarType> out_dtype_opt,
    bool transpose_result,
    int alg_id,
    int split_k,
    bool split_k_one_kernel,
    bool search_alg_id
);

at::Tensor _cslt_sparse_mm(
    const Tensor& compressed_A,
    const Tensor& dense_B,
    const std::optional<Tensor>& bias_opt,
    const std::optional<Tensor>& alpha_opt,
    const std::optional<c10::ScalarType> out_dtype_opt,
    bool transpose_result,
    int64_t alg_id,
    int64_t split_k,
    bool split_k_one_kernel
);

int64_t _cslt_sparse_mm_search(
    const Tensor& compressed_A,
    const Tensor& dense_B,
    const std::optional<Tensor>& bias_opt,
    const std::optional<Tensor>& alpha_opt,
    const std::optional<c10::ScalarType> out_dtype_opt,
    bool transpose_result
);

} // namespace at::native
@@ -68,16 +68,11 @@ bool check_prefer_cudnn_attention() {
std::array<SDPBackend, num_backends> priority_order(sdp_params const& params) {
  constexpr std::array<SDPBackend, num_backends> default_order{
      SDPBackend::flash_attention,
      SDPBackend::cudnn_attention,
      SDPBackend::efficient_attention,
      SDPBackend::math};
  constexpr std::array<SDPBackend, num_backends> cudnn_order{
      SDPBackend::math,
      SDPBackend::cudnn_attention,
      SDPBackend::flash_attention,
      SDPBackend::efficient_attention,
      SDPBackend::math};
  static const bool prefer_cudnn = check_prefer_cudnn_attention();
  return prefer_cudnn ? cudnn_order : default_order;
};
  return default_order;
}

bool use_tensor_cores(sdp_params const& params, cudaDeviceProp* dprops, bool is_half) {
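The hunk above removes the cuDNN-first ordering, so priority_order always returns default_order. Backend preference can still be pinned explicitly from Python; a small sketch using the public SDPA API (independent of this change, shown only to make the backend ordering concrete):

    import torch
    import torch.nn.functional as F
    from torch.nn.attention import SDPBackend, sdpa_kernel

    q = k = v = torch.randn(2, 8, 128, 64, dtype=torch.float16, device="cuda")
    # Restrict dispatch to flash attention, with the math backend as fallback.
    with sdpa_kernel([SDPBackend.FLASH_ATTENTION, SDPBackend.MATH]):
        out = F.scaled_dot_product_attention(q, k, v)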
@@ -34,12 +34,13 @@ int32_t XPUHooks::getGlobalIdxFromDevice(const at::Device& device) const {
#endif
}

const Generator& XPUHooks::getDefaultGenerator(DeviceIndex device_index) const {
  return at::xpu::detail::getDefaultXPUGenerator(device_index);
Generator XPUHooks::getXPUGenerator(DeviceIndex device_index) const {
  return make_generator<at::XPUGeneratorImpl>(device_index);
}

Generator XPUHooks::getNewGenerator(DeviceIndex device_index) const {
  return make_generator<at::XPUGeneratorImpl>(device_index);
const Generator& XPUHooks::getDefaultXPUGenerator(
    DeviceIndex device_index) const {
  return at::xpu::detail::getDefaultXPUGenerator(device_index);
}

Device XPUHooks::getDeviceFromPtr(void* data) const {
@@ -11,9 +11,9 @@ struct XPUHooks : public at::XPUHooksInterface {
  bool hasXPU() const override;
  std::string showConfig() const override;
  int32_t getGlobalIdxFromDevice(const at::Device& device) const override;
  const Generator& getDefaultGenerator(
  Generator getXPUGenerator(DeviceIndex device_index = -1) const override;
  const Generator& getDefaultXPUGenerator(
      DeviceIndex device_index = -1) const override;
  Generator getNewGenerator(DeviceIndex device_index = -1) const override;
  Device getDeviceFromPtr(void* data) const override;
  c10::DeviceIndex getNumGPUs() const override;
  DeviceIndex current_device() const override;
@@ -0,0 +1,44 @@
import sys

from benchmark_base import BenchmarkBase

import torch


class Benchmark(BenchmarkBase):
    N = 200

    def name(self):
        return "symint_sum"

    def description(self):
        return "see https://docs.google.com/document/d/11xJXl1etSmefUxPiVyk885e0Dl-4o7QwxYcPiMIo2iY/edit"

    def _prepare_once(self):
        torch._dynamo.config.capture_scalar_outputs = True
        torch.manual_seed(0)

        self.splits = torch.randint(10, (self.N,))

    def _prepare(self):
        torch._dynamo.reset()

    def _work(self):
        @torch.compile(fullgraph=True)
        def f(a):
            xs = a.tolist()
            y = sum(xs)
            return torch.tensor(y)

        f(self.splits)


def main():
    result_path = sys.argv[1]
    Benchmark().enable_compile_time_instruction_count().collect_all().append_results(
        result_path
    )


if __name__ == "__main__":
    main()
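A possible invocation of this new benchmark — the harness reads the output CSV path from argv[1]; the script location is assumed here, since the diff omits the file header:

    python symint_sum.py results.csv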
@@ -1,253 +0,0 @@
import argparse
import random

import pandas as pd
from tqdm import tqdm

import torch
import torch.utils.benchmark as benchmark
from torch import nn
from torch.sparse import SparseSemiStructuredTensor, to_sparse_semi_structured


torch.set_printoptions(
    precision=2,
    threshold=None,
    edgeitems=16,
    linewidth=480,
    profile=None,
    sci_mode=False,
)


# helper model definition for pruner
class Model(nn.Module):
    def __init__(self, m, k, dtype=None):
        super().__init__()
        # transposed so reversed
        self.linear = nn.Linear(k, m)

    def forward(self, x):
        return self.linear(x)


def rand_sparse_semi_structured_mask(
    r, c, dtype=torch.float16, device="cuda", choice=None
):
    """
    This function returns a 1:2 sparse matrix of size (r, c).
    Note that this means this matrix will also be 2:4 and 4:8 sparse as well.
    """

    choices = [[0, 1], [1, 0]]
    mask_entries = [choice or random.choice(choices) for i in range(r * c // 2)]

    return (
        torch.tensor(mask_entries, dtype=dtype, device=device)
        .reshape(r, c)
        .contiguous()
    )


def test_linear(m, k, n, dtype, contiguous, backend):
    SparseSemiStructuredTensor._FORCE_CUTLASS = backend == "cutlass"
    mask = rand_sparse_semi_structured_mask(m, k, dtype=dtype)
    sparse_weight = torch.rand(m, k).to(dtype).cuda() * mask
    input_tensor = torch.zeros(n, k).to(dtype).cuda()
    model = Model(m, k).to(dtype).cuda().eval()

    dense_measurement = benchmark.Timer(
        stmt="model(input_tensor)",
        globals=locals(),
    ).blocked_autorange()

    dense_output = model(input_tensor)
    print(dense_output.shape)

    # sparsify weights
    model.linear.weight = nn.Parameter(
        to_sparse_semi_structured(
            sparse_weight,
        )
    )

    sparse_output = model(input_tensor)
    print(sparse_output.shape)

    sparse_measurement = benchmark.Timer(
        stmt="model(input_tensor)",
        globals=locals(),
    ).blocked_autorange()

    correct = torch.allclose(dense_output, sparse_output, rtol=1e-3, atol=1e-3)

    return {
        "test_function": "linear",
        "m": m,
        "k": k,
        "n": n,
        "dtype": str(dtype),
        "backend": backend,
        "sparse_latency (ms)": sparse_measurement.median * 1000,
        "dense_latency (ms)": dense_measurement.median * 1000,
        "speedup (d/s)": dense_measurement.median / sparse_measurement.median,
        "correct": correct,
        "contiguous": sparse_output.is_contiguous(),
    }


def test_tensor(m, k, n, dtype, contiguous, backend):
    A = rand_sparse_semi_structured_mask(m, k, dtype=dtype)
    B = torch.zeros(k, n).to(dtype).cuda()
    bias = torch.rand(n).to(dtype).cuda()

    sA = to_sparse_semi_structured(A)

    # torch.mm calculation
    if dtype is not torch.int8:
        dense_output = torch.mm(A, B)

        dense_measurement = benchmark.Timer(
            stmt="torch.mm(A, B)",
            globals=locals(),
        ).blocked_autorange()

    else:
        print("int8 baseline not supported")
        dense_output = torch.mm(sA, B)

        dense_measurement = benchmark.Timer(
            stmt="torch.mm(sA, B)",
            globals=locals(),
        ).blocked_autorange()

    sparse_output = torch.mm(sA, B)
    sparse_measurement = benchmark.Timer(
        stmt="torch.mm(sA, B)",
        globals=locals(),
    ).blocked_autorange()

    correct = torch.allclose(dense_output, sparse_output, rtol=1e-3, atol=1e-3)

    return {
        "test_function": "tensor",
        "m": m,
        "k": k,
        "n": n,
        "dtype": str(dtype),
        "backend": backend,
        "sparse_latency (ms)": sparse_measurement.median * 1000,
        "dense_latency (ms)": dense_measurement.median * 1000,
        "speedup (d/s)": dense_measurement.median / sparse_measurement.median,
        "correct": correct,
        "contiguous": sparse_output.is_contiguous(),
    }


if __name__ == "__main__":
    dtype_lookup = {
        "int8": torch.int8,
        "fp16": torch.float16,
        "bf16": torch.bfloat16,
        "fp32": torch.float32,
    }

    parser = argparse.ArgumentParser(description="Semi-Structured Sparsity Benchmarks")
    parser.add_argument(
        "--mode",
        type=str,
        choices=[
            "nvidia-bert",
            "nvidia-fixed-k",
            "nvidia-fixed-mn",
        ],
    )
    parser.add_argument(
        "--dtype",
        type=str,
        choices=dtype_lookup.keys(),
        default="fp16",
    )
    parser.add_argument(
        "--backend", type=str, choices=["cutlass", "cusparselt"], default="cusparselt"
    )
    parser.add_argument("-contiguous", action="store_true")
    parser.add_argument("-e2e", action="store_true")
    parser.add_argument("-save", action="store_true")
    args = parser.parse_args()

    if args.e2e:
        eval_fn = test_linear
    else:
        eval_fn = test_tensor

    print(f"Started benchmark: {args.mode} | dtype: {args.dtype}")
    dtype = dtype_lookup[args.dtype]

    if args.mode == "nvidia-bert":
        bert_shapes = [
            (3072, 1024, 16384),
            (4096, 1024, 16384),
            (1024, 1024, 16384),
            (1024, 4096, 16384),
        ]
        results = (
            eval_fn(m, k, n, dtype, args.contiguous, args.backend)
            for (m, k, n) in tqdm(bert_shapes)
        )

    elif args.mode == "nvidia-fixed-k":
        mn_vals = [
            3072,
            4096,
            5120,
            6144,
            7168,
            8192,
            9216,
            10240,
            11264,
            12288,
            13312,
            14336,
            15360,
            16384,
            17408,
            18432,
            19456,
            20480,
        ]
        results = (
            eval_fn(mn, 10240, mn, dtype, args.contiguous, args.backend)
            for mn in tqdm(mn_vals)
        )

    elif args.mode == "nvidia-fixed-mn":
        k_vals = [
            2560,
            3840,
            5120,
            6400,
            7680,
            8960,
            10240,
            11520,
            12800,
            14080,
            15360,
            16640,
            17920,
            19200,
            20480,
        ]
        results = (
            eval_fn(10240, k, 10240, dtype, args.contiguous, args.backend)
            for k in tqdm(k_vals)
        )

    df = pd.DataFrame.from_records(results)
    if args.save:
        save_file = f"{args.mode}_{args.dtype}_{args.backend}.csv"
        df.to_csv(save_file)
        print(f"Finished benchmark: {args.mode} saved results to {save_file}")
    print(df)
@@ -87,8 +87,6 @@ void reportOutOfMemoryToProfiler(
  }
}

MemoryReportingInfoBase::MemoryReportingInfoBase() = default;

void MemoryReportingInfoBase::reportOutOfMemory(
    int64_t /*alloc_size*/,
    size_t /*total_allocated*/,
@@ -157,6 +157,7 @@ inline bool operator!=(std::nullptr_t, const DataPtr& dp) noexcept {
// possible, or the raw interface will incorrectly reported as unsupported,
// when it is actually possible.

// NOLINTNEXTLINE(cppcoreguidelines-special-member-functions)
struct C10_API Allocator {
  virtual ~Allocator() = default;

@@ -223,10 +224,24 @@ struct C10_API Allocator {
// allocation InefficientStdFunctionContext, on top of the dynamic
// allocation which is implied by std::function itself.
struct C10_API InefficientStdFunctionContext {
  void* ptr_;
  void* ptr_{nullptr};
  std::function<void(void*)> deleter_;
  InefficientStdFunctionContext(void* ptr, std::function<void(void*)> deleter)
      : ptr_(ptr), deleter_(std::move(deleter)) {}
  InefficientStdFunctionContext(const InefficientStdFunctionContext&) = delete;
  InefficientStdFunctionContext(InefficientStdFunctionContext&& rhs) noexcept
      : ptr_(std::exchange(rhs.ptr_, nullptr)),
        deleter_(std::move(rhs.deleter_)) {}
  InefficientStdFunctionContext& operator=(
      const InefficientStdFunctionContext&) = delete;
  // NOLINTNEXTLINE(performance-noexcept-move-constructor)
  InefficientStdFunctionContext& operator=(
      InefficientStdFunctionContext&& rhs) {
    this->~InefficientStdFunctionContext();
    ptr_ = std::exchange(rhs.ptr_, nullptr);
    deleter_ = std::move(rhs.deleter_);
    return *this;
  }
  ~InefficientStdFunctionContext() {
    if (deleter_) {
      deleter_(ptr_);
@@ -270,9 +285,6 @@ struct AllocatorRegisterer {
// An interface for reporting thread local memory usage
// per device
struct C10_API MemoryReportingInfoBase : public c10::DebugInfoBase {
  MemoryReportingInfoBase();
  ~MemoryReportingInfoBase() override = default;

  /**
   * alloc_size corresponds to the size of the ptr.
   *
@@ -312,6 +324,7 @@ C10_API void reportOutOfMemoryToProfiler(
    Device device);

// used to hold traceback information in allocators
// NOLINTNEXTLINE(cppcoreguidelines-special-member-functions)
struct GatheredContext {
  virtual ~GatheredContext() = default;
};
@@ -75,9 +75,6 @@ ProfiledCPUMemoryReporter& profiledCPUMemoryReporter() {
template <uint32_t PreGuardBytes, uint32_t PostGuardBytes>
class DefaultMobileCPUAllocator final : public at::Allocator {
 public:
  DefaultMobileCPUAllocator() = default;
  ~DefaultMobileCPUAllocator() override = default;

  static void deleter(void* const pointer) {
    if (C10_UNLIKELY(!pointer)) {
      return;
@@ -34,6 +34,8 @@ class DeviceGuard {
      const impl::DeviceGuardImplInterface* impl)
      : guard_(device, impl) {}

  ~DeviceGuard() = default;

  /// Copy is disallowed
  DeviceGuard(const DeviceGuard&) = delete;
  DeviceGuard& operator=(const DeviceGuard&) = delete;
@@ -143,6 +145,7 @@ class OptionalDeviceGuard {
      const impl::DeviceGuardImplInterface* impl)
      : guard_(device, impl) {}

  ~OptionalDeviceGuard() = default;
  /// Copy is disallowed
  OptionalDeviceGuard(const OptionalDeviceGuard&) = delete;
  OptionalDeviceGuard& operator=(const OptionalDeviceGuard&) = delete;
@@ -61,6 +61,7 @@ struct C10_API GeneratorImpl : public c10::intrusive_ptr_target {
  GeneratorImpl(const GeneratorImpl& other) = delete;
  GeneratorImpl(GeneratorImpl&& other) = delete;
  GeneratorImpl& operator=(const GeneratorImpl& other) = delete;
  GeneratorImpl& operator=(GeneratorImpl&& other) = delete;

  ~GeneratorImpl() override = default;
  c10::intrusive_ptr<GeneratorImpl> clone() const;
@@ -16,6 +16,10 @@ struct C10_API AutoGradMode {
  AutoGradMode(bool enabled) : prev_mode(GradMode::is_enabled()) {
    GradMode::set_enabled(enabled);
  }
  AutoGradMode(const AutoGradMode&) = delete;
  AutoGradMode(AutoGradMode&&) = delete;
  AutoGradMode& operator=(const AutoGradMode&) = delete;
  AutoGradMode& operator=(AutoGradMode&&) = delete;
  ~AutoGradMode() {
    GradMode::set_enabled(prev_mode);
  }
@@ -35,6 +39,10 @@ struct C10_API AutoFwGradMode {
      : prev_mode(AutogradState::get_tls_state().get_fw_grad_mode()) {
    AutogradState::get_tls_state().set_fw_grad_mode(enabled);
  }
  AutoFwGradMode(const AutoFwGradMode&) = delete;
  AutoFwGradMode(AutoFwGradMode&&) = delete;
  AutoFwGradMode& operator=(const AutoFwGradMode&) = delete;
  AutoFwGradMode& operator=(AutoFwGradMode&&) = delete;
  ~AutoFwGradMode() {
    AutogradState::get_tls_state().set_fw_grad_mode(prev_mode);
  }
@@ -73,6 +73,11 @@ struct C10_API InferenceMode {
    c10::impl::_force_tls_local_dispatch_key_set(cur_keyset);
  }

  InferenceMode(const InferenceMode&) = delete;
  InferenceMode(InferenceMode&&) = delete;
  InferenceMode& operator=(const InferenceMode&) = delete;
  InferenceMode& operator=(InferenceMode&&) = delete;

  ~InferenceMode() {
    AutogradState::set_tls_state(prev_mode);
    c10::impl::_force_tls_local_dispatch_key_set(prev_keyset);
@@ -81,9 +81,11 @@ template <typename T>
struct SafePyObjectT : private SafePyObject {
  SafePyObjectT(PyObject* data, c10::impl::PyInterpreter* pyinterpreter)
      : SafePyObject(data, pyinterpreter) {}
  ~SafePyObjectT() = default;
  SafePyObjectT(SafePyObjectT&& other) noexcept : SafePyObject(other) {}
  SafePyObjectT(SafePyObjectT const&) = delete;
  SafePyObjectT& operator=(SafePyObjectT const&) = delete;
  SafePyObjectT& operator=(SafePyObjectT&&) = delete;

  using SafePyObject::ptr;
  using SafePyObject::pyinterpreter;
@@ -23,7 +23,7 @@ C10_API void warnDeprecatedDataPtr();
// Currently used only for storing a custom error message
// used when throwing an exception when data_ptr is accessed.
struct C10_API StorageExtraMeta {
  c10::optional<std::string> custom_data_ptr_error_msg_ = c10::nullopt;
  std::optional<std::string> custom_data_ptr_error_msg_ = c10::nullopt;
  StorageExtraMeta() = default;
  StorageExtraMeta(const StorageExtraMeta& other) {
    if (other.custom_data_ptr_error_msg_) {
@@ -283,7 +283,7 @@ struct C10_API StorageImpl : public c10::intrusive_ptr_target {
  [[noreturn]] void throw_data_ptr_access_error() const;

  void release_data_and_set_meta_custom_data_ptr_error_msg_(
      c10::optional<std::string> s) {
      std::optional<std::string> s) {
    throw_on_immutable_data_ptr_ = true;
    get_extra_meta().custom_data_ptr_error_msg_ = std::move(s);
    refresh_has_data_ptr_check();
@@ -27,6 +27,7 @@ namespace c10 {
struct StreamGuard {
  /// No default constructor, see Note [Omitted default constructor from RAII]
  explicit StreamGuard() = delete;
  ~StreamGuard() = default;

  /// Set the current device to the device associated with the passed stream,
  /// and set the current stream on that device to the passed stream.
@@ -111,6 +112,7 @@ struct OptionalStreamGuard {

  // See Note [Move assignment for RAII guards is tricky]
  OptionalStreamGuard& operator=(OptionalStreamGuard&& other) = delete;
  ~OptionalStreamGuard() = default;

  /// Resets the currently set stream to the original stream and
  /// the currently set device to the original device. Then,
@@ -162,6 +164,7 @@ struct MultiStreamGuard {

  // See Note [Move assignment for RAII guards is tricky]
  MultiStreamGuard& operator=(MultiStreamGuard&& other) = delete;
  ~MultiStreamGuard() = default;

 private:
  c10::impl::InlineMultiStreamGuard<impl::VirtualGuardImpl> guard_;
@@ -22,7 +22,9 @@ class C10_API SymbolicShapeMeta {
  bool strides_valid_ = true; // e.g. for sparse where there are no strides

  SymbolicShapeMeta() = default;
  ~SymbolicShapeMeta() = default;
  SymbolicShapeMeta(const SymbolicShapeMeta& other);
  SymbolicShapeMeta(SymbolicShapeMeta&& other) = delete;
  SymbolicShapeMeta& operator=(const SymbolicShapeMeta& other) = delete;
  SymbolicShapeMeta& operator=(SymbolicShapeMeta&& other) = delete;
@@ -133,6 +133,7 @@ struct C10_API PlacementDeleteContext {
  DataPtr data_ptr_;
  PlacementDtor placement_dtor_;
  size_t size_;

  PlacementDeleteContext(
      DataPtr&& data_ptr,
      PlacementDtor placement_dtor,
@@ -140,6 +141,11 @@ struct C10_API PlacementDeleteContext {
      : data_ptr_(std::move(data_ptr)),
        placement_dtor_(placement_dtor),
        size_(size) {}

  PlacementDeleteContext(PlacementDeleteContext&&) noexcept = delete;
  PlacementDeleteContext(const PlacementDeleteContext&) = delete;
  PlacementDeleteContext& operator=(const PlacementDeleteContext&) = delete;
  PlacementDeleteContext& operator=(PlacementDeleteContext&&) = delete;
  static DataPtr makeDataPtr(
      DataPtr&& data_ptr,
      PlacementDtor placement_dtor,
@@ -237,6 +243,7 @@ struct C10_API ExtraMeta {
  std::optional<std::string> custom_storage_error_msg_ = std::nullopt;

  ExtraMeta() = default;
  ~ExtraMeta() = default;
  ExtraMeta(const ExtraMeta& other) {
    if (other.symbolic_shape_meta_) {
      symbolic_shape_meta_ =
@@ -62,7 +62,7 @@ class InlineDeviceGuard {
  // DeviceGuard which reads the current device and promises to
  // restore to that device on exit. However, most cases where you
  // would have written this, you probably meant to actually just
  // use OptionalDeviceGuard (since you don't actually need the
  // use DeviceGuard (since you don't actually need the
  // restore to happen if you don't ever actually set the device).
  // We remove the constructor here to encourage you to think about
  // what you actually want to happen.
@@ -221,6 +221,7 @@ class InlineOptionalDeviceGuard {
  explicit InlineOptionalDeviceGuard()
      : guard_() // See Note [Explicit initialization of optional fields]
  {}
  ~InlineOptionalDeviceGuard() = default;

  /// Set the current device to the passed Device, if it is not nullopt.
  explicit InlineOptionalDeviceGuard(std::optional<Device> device_opt)
@@ -286,6 +287,7 @@ class InlineOptionalDeviceGuard {
  // It's in principle possible to raise an error when this occurs
  // by doing some extra thread-local bookkeeping. But why bother?
  // Just don't provide the constructor.
  InlineOptionalDeviceGuard(const InlineOptionalDeviceGuard<T>& other) = delete;
  InlineOptionalDeviceGuard(InlineOptionalDeviceGuard<T>&& other) = delete;

  // Note [Move assignment for RAII guards is tricky]
@@ -335,6 +337,8 @@ class InlineOptionalDeviceGuard {
  //
  // We could solve this with an extra thread-local variable. But no one is
  // actually using move-assignment. So just get rid of it.
  InlineOptionalDeviceGuard& operator=(const InlineOptionalDeviceGuard& other) =
      delete;
  InlineOptionalDeviceGuard& operator=(InlineOptionalDeviceGuard&& other) =
      delete;
@@ -135,6 +135,7 @@ class InlineOptionalStreamGuard {
  explicit InlineOptionalStreamGuard()
      : guard_() // See Note [Explicit initialization of optional fields]
  {}
  ~InlineOptionalStreamGuard() = default;

  /// Set the current device to the device associated with the passed stream,
  /// and set the current stream on that device to the passed stream,
@@ -151,6 +152,9 @@ class InlineOptionalStreamGuard {
  explicit InlineOptionalStreamGuard(Args&&... args)
      : guard_(std::in_place, std::forward<Args>(args)...) {}

  InlineOptionalStreamGuard(const InlineOptionalStreamGuard<T>& other) = delete;
  InlineOptionalStreamGuard& operator=(const InlineOptionalStreamGuard& other) =
      delete;
  // See Note [Move construction for RAII guards is tricky]
  InlineOptionalStreamGuard(InlineOptionalStreamGuard<T>&& other) = delete;
@@ -132,6 +132,11 @@ struct C10_API ForceDispatchKeyGuard {
    updated_set.excluded_ = exclude;
    c10::impl::_force_tls_local_dispatch_key_set(updated_set);
  }

  ForceDispatchKeyGuard(ForceDispatchKeyGuard&&) noexcept = delete;
  ForceDispatchKeyGuard(const ForceDispatchKeyGuard&) = delete;
  ForceDispatchKeyGuard& operator=(const ForceDispatchKeyGuard&) = delete;
  ForceDispatchKeyGuard& operator=(ForceDispatchKeyGuard&&) = delete;
  ~ForceDispatchKeyGuard() {
    c10::impl::_force_tls_local_dispatch_key_set(saved_keyset_);
  }
@@ -15,6 +15,7 @@ struct C10_API DisablePythonDispatcher {
  DisablePythonDispatcher() : old_(PythonDispatcherTLS::get_state()) {
    PythonDispatcherTLS::set_state({});
  }

  ~DisablePythonDispatcher() {
    PythonDispatcherTLS::set_state(old_);
  }
@@ -2016,6 +2016,13 @@ class DeviceCachingAllocator {
    }
  }

  void ensureExistsAndIncrefPool(MempoolId_t mempool_id) {
    // Create a PrivatePool object if it does not exist yet
    // and increment its use_count
    std::lock_guard<std::recursive_mutex> lock(mutex);
    ensure_exists_and_incref_pool(mempool_id);
  }

  // See Note [Interaction with CUDA graph capture]

  // Called by CUDAGraph::capture_begin
@@ -2023,18 +2030,7 @@ class DeviceCachingAllocator {
      MempoolId_t mempool_id,
      std::function<bool(cudaStream_t)> filter) {
    std::lock_guard<std::recursive_mutex> lock(mutex);
    auto it = graph_pools.find(mempool_id);
    if (it == graph_pools.end()) {
      // mempool_id does not reference an existing pool. Make a new pool for
      // this capture.
      graph_pools.emplace(mempool_id, std::make_unique<PrivatePool>());
    } else {
      // mempool_id references an existing pool, which the current capture will
      // share. Check this pool is live (at least one other capture already
      // references it).
      TORCH_INTERNAL_ASSERT(it->second->use_count > 0);
      it->second->use_count++;
    }
    ensure_exists_and_incref_pool(mempool_id);
    for (auto it2 = captures_underway.begin(); it2 != captures_underway.end();
         ++it2) {
      TORCH_CHECK(
@@ -2058,7 +2054,7 @@ class DeviceCachingAllocator {
          false, "endAllocatePool: not currently recording to mempool_id");
  }

  // Called by CUDAGraph::reset
  // Called by CUDAGraph::reset and MemPool::~MemPool()
  void releasePool(MempoolId_t mempool_id) {
    std::lock_guard<std::recursive_mutex> lock(mutex);
    // The instantiated cudaGraphExec_t has been destroyed. We can't blindly
@@ -2070,20 +2066,24 @@ class DeviceCachingAllocator {
    // mempool. When the count reaches 0, we tell free_cached_blocks it may now
    // cudaFree blocks from this graph's pool when it discovers they're unused
    // (unsplit).
    auto it = graph_pools.find(mempool_id);
    TORCH_INTERNAL_ASSERT(it != graph_pools.end());
    auto uc = --(it->second->use_count);
    auto pp = get_private_pool(mempool_id);
    auto uc = --(pp->use_count);
    TORCH_INTERNAL_ASSERT(uc >= 0);
    if (uc == 0) {
      // Allows free_cached_blocks to begin cudaFreeing this pool's memory,
      // and makes sure this pool wasn't somehow made freeable already.
      // NOLINTNEXTLINE(clang-analyzer-deadcode.DeadStores)
      bool inserted =
          graph_pools_freeable.insert({mempool_id, it->second.get()}).second;
      bool inserted = graph_pools_freeable.insert({mempool_id, pp}).second;
      TORCH_INTERNAL_ASSERT(inserted);
    }
  }

  int getPoolUseCount(MempoolId_t mempool_id) {
    std::lock_guard<std::recursive_mutex> lock(mutex);
    auto pp = get_private_pool(mempool_id);
    return pp->use_count;
  }

  void addPeerAccess(c10::DeviceIndex dev_to_access) {
    std::lock_guard<std::recursive_mutex> lock(mutex);
    if (std::find(
@@ -2152,6 +2152,30 @@ class DeviceCachingAllocator {
    return blocks;
  }

  void ensure_exists_and_incref_pool(MempoolId_t mempool_id) {
    auto it = graph_pools.find(mempool_id);
    if (it == graph_pools.end()) {
      // mempool_id does not reference an existing pool.
      // Make a new pool for CUDAGraph capture or torch.cuda.use_mem_pool
      // usage. use_count is initially 1, which means the pool is
      // being used since somebody called ensureExistsAndIncrefPool.
      graph_pools.emplace(mempool_id, std::make_unique<PrivatePool>());
    } else {
      // mempool_id references an existing pool, which the current CUDAGraph
      // capture or torch.cuda.use_mem_pool will
      // share. Check this pool is live (at least one other capture already
      // references it). Increment it to establish the usage.
      TORCH_INTERNAL_ASSERT(it->second->use_count > 0);
      it->second->use_count++;
    }
  }

  PrivatePool* get_private_pool(MempoolId_t mempool_id) {
    auto it = graph_pools.find(mempool_id);
    TORCH_INTERNAL_ASSERT(it != graph_pools.end());
    return it->second.get();
  }

  // returns the smallest possible address in any segment
  // where there is enough free address space to fit size
  // may be composed of free and unmapped segments
@@ -3536,6 +3560,14 @@ class NativeCachingAllocator : public CUDAAllocator {
    assertValidDevice(device);
    device_allocator[device]->resetPeakStats();
  }

  void ensureExistsAndIncrefPool(
      c10::DeviceIndex device,
      MempoolId_t mempool_id) override {
    assertValidDevice(device);
    device_allocator[device]->ensureExistsAndIncrefPool(std::move(mempool_id));
  }

  // CUDAGraph interactions
  void beginAllocateToPool(
      c10::DeviceIndex device,
@@ -3557,6 +3589,12 @@ class NativeCachingAllocator : public CUDAAllocator {
    device_allocator[device]->releasePool(std::move(mempool_id));
  }

  int getPoolUseCount(c10::DeviceIndex device, MempoolId_t mempool_id)
      override {
    assertValidDevice(device);
    return device_allocator[device]->getPoolUseCount(std::move(mempool_id));
  }

  void* raw_alloc(size_t nbytes) override {
    if (nbytes == 0) {
      return nullptr;
@@ -3844,6 +3882,13 @@ MemPool::MemPool(
  } else {
    id_ = {uuid_++, 0};
  }
  device_ = c10::cuda::current_device();
  CUDACachingAllocator::ensureExistsAndIncrefPool(device_, id_);
}

MemPool::~MemPool() {
  TORCH_INTERNAL_ASSERT(use_count() == 1);
  CUDACachingAllocator::releasePool(device_, id_);
}

MempoolId_t MemPool::id() {
@@ -3854,6 +3899,17 @@ CUDACachingAllocator::CUDAAllocator* MemPool::allocator() {
  return allocator_;
}

int MemPool::use_count() {
  return CUDACachingAllocator::getPoolUseCount(device_, id_);
}

MempoolId_t MemPool::graph_pool_handle(bool is_user_created) {
  if (is_user_created) {
    return {0, uid_++};
  }
  return {uuid_++, 0};
}

// Note that active_mempool_ is a global variable here
// and not inside MemPoolContext class, because in windows we
// can't use __declspec(dllexport) and __declspec(thread)
@@ -224,6 +224,22 @@ class CUDAAllocator : public Allocator {
      c10::DeviceIndex device,
      MempoolId_t mempool_id) = 0;
  virtual void releasePool(c10::DeviceIndex device, MempoolId_t mempool_id) = 0;
  virtual int getPoolUseCount(c10::DeviceIndex device, MempoolId_t mempool_id) {
    TORCH_CHECK(
        false,
        name(),
        " does not yet support getPoolUseCount. "
        "If you need it, please file an issue describing your use case.");
  }
  virtual void ensureExistsAndIncrefPool(
      c10::DeviceIndex device,
      MempoolId_t mempool_id) {
    TORCH_CHECK(
        false,
        name(),
        " does not yet support ensureExistsAndIncrefPool. "
        "If you need it, please file an issue describing your use case.");
  }
  // returns true if the allocated blocks are equal to expected live allocations
  virtual bool checkPoolLiveAllocations(
      c10::DeviceIndex device,
@@ -427,6 +443,16 @@ inline void attachAllocatorTraceTracker(AllocatorTraceTracker tracker) {
inline void releasePool(c10::DeviceIndex device, MempoolId_t mempool_id) {
  return get()->releasePool(device, mempool_id);
}
inline void ensureExistsAndIncrefPool(
    c10::DeviceIndex device,
    MempoolId_t mempool_id) {
  get()->ensureExistsAndIncrefPool(device, mempool_id);
}

inline int getPoolUseCount(c10::DeviceIndex device, MempoolId_t mempool_id) {
  return get()->getPoolUseCount(device, mempool_id);
}

// Not part of CUDA_ALLOCATOR_BACKEND_INTERFACE
inline std::shared_ptr<void> getIpcDevPtr(std::string handle) {
  return get()->getIpcDevPtr(std::move(handle));
@@ -472,9 +498,12 @@ struct C10_CUDA_API MemPool {
  MemPool(
      CUDACachingAllocator::CUDAAllocator* allocator = nullptr,
      bool is_user_created = true);
  ~MemPool();

  MempoolId_t id();
  CUDACachingAllocator::CUDAAllocator* allocator();
  int use_count();
  static MempoolId_t graph_pool_handle(bool is_user_created = true);

 private:
  static std::atomic<CaptureId_t> uid_;
@@ -482,6 +511,7 @@ struct C10_CUDA_API MemPool {
  CUDACachingAllocator::CUDAAllocator* allocator_;
  bool is_user_created_;
  MempoolId_t id_;
  c10::DeviceIndex device_;
};

// MemPoolContext holds the currently active pool and stashes the previous
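Taken together, these hooks give user-created pools real reference counting: MemPool's constructor now calls ensureExistsAndIncrefPool and its destructor asserts use_count() == 1 before calling releasePool. A hedged Python-level sketch of the flow the comments describe (the torch.cuda.use_mem_pool name comes from the allocator comments above; the torch.cuda.MemPool binding is assumed):

    import torch

    pool = torch.cuda.MemPool()  # C++ ctor increfs the backing private pool
    with torch.cuda.use_mem_pool(pool):
        buf = torch.empty(1 << 20, device="cuda")  # allocation routed to the pool
    del buf
    del pool  # dtor checks use_count() == 1, then releases the pool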
Some files were not shown because too many files have changed in this diff.