Compare commits

..

2 Commits

Author SHA1 Message Date
c44f682b27 Properly register parameter for binary folding test
ghstack-source-id: 0a2795dd0aca8063a64285ec4914e785d9c4d99b
Pull Request resolved: https://github.com/pytorch/pytorch/pull/128356
2024-06-10 19:21:01 -07:00
fca5402bcd Mark params static if inlining modules and freezing
ghstack-source-id: 59e87707d5d0f7392346556843f3955e43e8a39d
Pull Request resolved: https://github.com/pytorch/pytorch/pull/128355
2024-06-10 19:21:00 -07:00
358 changed files with 13455 additions and 7451 deletions

View File

@ -91,9 +91,9 @@ _UCC_COMMIT=20eae37090a4ce1b32bcce6144ccad0b49943e0b
# configuration, so we hardcode everything here rather than do it
# from scratch
case "$image" in
pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9)
pytorch-linux-focal-cuda12.4-cudnn8-py3-gcc9)
CUDA_VERSION=12.4.0
CUDNN_VERSION=9
CUDNN_VERSION=8
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=9
PROTOBUF=yes
@ -105,9 +105,9 @@ case "$image" in
CONDA_CMAKE=yes
TRITON=yes
;;
pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9)
pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9)
CUDA_VERSION=12.1.1
CUDNN_VERSION=9
CUDNN_VERSION=8
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=9
PROTOBUF=yes
@ -119,9 +119,9 @@ case "$image" in
CONDA_CMAKE=yes
TRITON=yes
;;
pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9-inductor-benchmarks)
pytorch-linux-focal-cuda12.4-cudnn8-py3-gcc9-inductor-benchmarks)
CUDA_VERSION=12.4.0
CUDNN_VERSION=9
CUDNN_VERSION=8
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=9
PROTOBUF=yes
@ -134,9 +134,9 @@ case "$image" in
TRITON=yes
INDUCTOR_BENCHMARKS=yes
;;
pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9-inductor-benchmarks)
pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9-inductor-benchmarks)
CUDA_VERSION=12.1.1
CUDNN_VERSION=9
CUDNN_VERSION=8
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=9
PROTOBUF=yes
@ -149,9 +149,9 @@ case "$image" in
TRITON=yes
INDUCTOR_BENCHMARKS=yes
;;
pytorch-linux-focal-cuda12.1-cudnn9-py3.12-gcc9-inductor-benchmarks)
pytorch-linux-focal-cuda12.1-cudnn8-py3.12-gcc9-inductor-benchmarks)
CUDA_VERSION=12.1.1
CUDNN_VERSION=9
CUDNN_VERSION=8
ANACONDA_PYTHON_VERSION=3.12
GCC_VERSION=9
PROTOBUF=yes
@ -164,9 +164,9 @@ case "$image" in
TRITON=yes
INDUCTOR_BENCHMARKS=yes
;;
pytorch-linux-focal-cuda12.4-cudnn9-py3.12-gcc9-inductor-benchmarks)
pytorch-linux-focal-cuda12.4-cudnn8-py3.12-gcc9-inductor-benchmarks)
CUDA_VERSION=12.4.0
CUDNN_VERSION=9
CUDNN_VERSION=8
ANACONDA_PYTHON_VERSION=3.12
GCC_VERSION=9
PROTOBUF=yes
@ -179,9 +179,9 @@ case "$image" in
TRITON=yes
INDUCTOR_BENCHMARKS=yes
;;
pytorch-linux-focal-cuda11.8-cudnn9-py3-gcc9)
pytorch-linux-focal-cuda11.8-cudnn8-py3-gcc9)
CUDA_VERSION=11.8.0
CUDNN_VERSION=9
CUDNN_VERSION=8
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=9
PROTOBUF=yes
@ -193,9 +193,9 @@ case "$image" in
CONDA_CMAKE=yes
TRITON=yes
;;
pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9)
pytorch-linux-focal-cuda12.4-cudnn8-py3-gcc9)
CUDA_VERSION=12.4.0
CUDNN_VERSION=9
CUDNN_VERSION=8
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=9
PROTOBUF=yes
@ -207,9 +207,9 @@ case "$image" in
CONDA_CMAKE=yes
TRITON=yes
;;
pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9)
pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9)
CUDA_VERSION=12.1.1
CUDNN_VERSION=9
CUDNN_VERSION=8
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=9
PROTOBUF=yes
@ -221,9 +221,9 @@ case "$image" in
CONDA_CMAKE=yes
TRITON=yes
;;
pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9)
pytorch-linux-focal-cuda12.4-cudnn8-py3-gcc9)
CUDA_VERSION=12.4.0
CUDNN_VERSION=9
CUDNN_VERSION=8
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=9
PROTOBUF=yes
@ -330,10 +330,10 @@ case "$image" in
DOCS=yes
INDUCTOR_BENCHMARKS=yes
;;
pytorch-linux-jammy-cuda11.8-cudnn9-py3.8-clang12)
pytorch-linux-jammy-cuda11.8-cudnn8-py3.8-clang12)
ANACONDA_PYTHON_VERSION=3.8
CUDA_VERSION=11.8
CUDNN_VERSION=9
CUDNN_VERSION=8
CLANG_VERSION=12
PROTOBUF=yes
DB=yes
@ -380,7 +380,7 @@ case "$image" in
ANACONDA_PYTHON_VERSION=3.9
CONDA_CMAKE=yes
;;
pytorch-linux-jammy-cuda11.8-cudnn9-py3.9-linter)
pytorch-linux-jammy-cuda11.8-cudnn8-py3.9-linter)
ANACONDA_PYTHON_VERSION=3.9
CUDA_VERSION=11.8
CONDA_CMAKE=yes
@ -447,7 +447,7 @@ tmp_tag=$(basename "$(mktemp -u)" | tr '[:upper:]' '[:lower:]')
#when using cudnn version 8 install it separately from cuda
if [[ "$image" == *cuda* && ${OS} == "ubuntu" ]]; then
IMAGE_NAME="nvidia/cuda:${CUDA_VERSION}-cudnn${CUDNN_VERSION}-devel-ubuntu${UBUNTU_VERSION}"
if [[ ${CUDNN_VERSION} == 9 ]]; then
if [[ ${CUDNN_VERSION} == 8 ]]; then
IMAGE_NAME="nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}"
fi
fi
@ -499,7 +499,7 @@ docker build \
"$@" \
.
# NVIDIA dockers for RC releases use tag names like `11.0-cudnn9-devel-ubuntu18.04-rc`,
# NVIDIA dockers for RC releases use tag names like `11.0-cudnn8-devel-ubuntu18.04-rc`,
# for this case we will set UBUNTU_VERSION to `18.04-rc` so that the Dockerfile could
# find the correct image. As a result, here we have to replace the
# "$UBUNTU_VERSION" == "18.04-rc"

View File

@ -1 +1 @@
01cbe5045a6898c9a925f01435c8277b2fe6afcc
bbe6246e37d8aa791c67daaf9d9d61b26c9ccfdc

View File

@ -3,7 +3,7 @@
set -ex
install_ubuntu() {
# NVIDIA dockers for RC releases use tag names like `11.0-cudnn9-devel-ubuntu18.04-rc`,
# NVIDIA dockers for RC releases use tag names like `11.0-cudnn8-devel-ubuntu18.04-rc`,
# for this case we will set UBUNTU_VERSION to `18.04-rc` so that the Dockerfile could
# find the correct image. As a result, here we have to check for
# "$UBUNTU_VERSION" == "18.04"*

View File

@ -1,18 +1,23 @@
#!/bin/bash
if [[ -n "${CUDNN_VERSION}" ]]; then
if [[ ${CUDNN_VERSION} == 8 ]]; then
# cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
mkdir tmp_cudnn
pushd tmp_cudnn
if [[ ${CUDA_VERSION:0:2} == "12" ]]; then
CUDNN_NAME="cudnn-linux-x86_64-9.1.0.70_cuda12-archive"
elif [[ ${CUDA_VERSION:0:2} == "11" ]]; then
CUDNN_NAME="cudnn-linux-x86_64-9.1.0.70_cuda11-archive"
if [[ ${CUDA_VERSION:0:4} == "12.4" ]]; then
CUDNN_NAME="cudnn-linux-x86_64-8.9.7.29_cuda12-archive"
curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/${CUDNN_NAME}.tar.xz
elif [[ ${CUDA_VERSION:0:4} == "12.1" ]]; then
CUDNN_NAME="cudnn-linux-x86_64-8.9.2.26_cuda12-archive"
curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/${CUDNN_NAME}.tar.xz
elif [[ ${CUDA_VERSION:0:4} == "11.8" ]]; then
CUDNN_NAME="cudnn-linux-x86_64-8.7.0.84_cuda11-archive"
curl --retry 3 -OLs https://developer.download.nvidia.com/compute/redist/cudnn/v8.7.0/local_installers/11.8/${CUDNN_NAME}.tar.xz
else
print "Unsupported CUDA version ${CUDA_VERSION}"
exit 1
fi
curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/${CUDNN_NAME}.tar.xz
tar xf ${CUDNN_NAME}.tar.xz
cp -a ${CUDNN_NAME}/include/* /usr/local/cuda/include/
cp -a ${CUDNN_NAME}/lib/* /usr/local/cuda/lib64/

View File

@ -139,7 +139,7 @@ COPY --from=pytorch/llvm:9.0.1 /opt/llvm /opt/llvm
ARG CUDNN_VERSION
ARG CUDA_VERSION
COPY ./common/install_cudnn.sh install_cudnn.sh
RUN if [ -n "${CUDNN_VERSION}" ]; then bash install_cudnn.sh; fi
RUN if [ "${CUDNN_VERSION}" -eq 8 ]; then bash install_cudnn.sh; fi
RUN rm install_cudnn.sh
# Install CUSPARSELT

View File

@ -368,7 +368,7 @@ test_inductor_cpp_wrapper_abi_compatible() {
echo "Testing Inductor cpp wrapper mode with TORCHINDUCTOR_ABI_COMPATIBLE=1"
# cpu stack allocation causes segfault and needs more investigation
PYTORCH_TESTING_DEVICE_ONLY_FOR="" python test/run_test.py --include inductor/test_cpu_cpp_wrapper
python test/run_test.py --include inductor/test_cpu_cpp_wrapper
python test/run_test.py --include inductor/test_cuda_cpp_wrapper
TORCHINDUCTOR_CPP_WRAPPER=1 python benchmarks/dynamo/timm_models.py --device cuda --accuracy --amp \

View File

@ -16,17 +16,6 @@ self-hosted-runner:
- linux.8xlarge.nvidia.gpu
- linux.16xlarge.nvidia.gpu
- linux.g5.4xlarge.nvidia.gpu
# Organization-wide AWS Linux Runners on Linux Foundation account
- lf.linux.large
- lf.linux.2xlarge
- lf.linux.4xlarge
- lf.linux.12xlarge
- lf.linux.24xlarge
- lf.linux.arm64.2xlarge
- lf.linux.4xlarge.nvidia.gpu
- lf.linux.8xlarge.nvidia.gpu
- lf.linux.16xlarge.nvidia.gpu
- lf.linux.g5.4xlarge.nvidia.gpu
# Repo-specific IBM hosted S390x runner
- linux.s390x
# Organization wide AWS Windows runners

View File

@ -8,7 +8,6 @@ ciflow_push_tags:
- ciflow/inductor
- ciflow/inductor-perf-compare
- ciflow/inductor-micro-benchmark
- ciflow/inductor-cu124
- ciflow/linux-aarch64
- ciflow/mps
- ciflow/nightly

View File

@ -19,7 +19,7 @@ CUDA_ARCHES = ["11.8", "12.1", "12.4"]
CUDA_ARCHES_FULL_VERSION = {"11.8": "11.8.0", "12.1": "12.1.1", "12.4": "12.4.0"}
CUDA_ARCHES_CUDNN_VERSION = {"11.8": "9", "12.1": "9", "12.4": "9"}
CUDA_ARCHES_CUDNN_VERSION = {"11.8": "8", "12.1": "8", "12.4": "8"}
ROCM_ARCHES = ["6.0", "6.1"]
@ -42,7 +42,7 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
"nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | " # noqa: B950
"nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cudnn-cu11==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cudnn-cu11==8.7.0.84; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | "
@ -55,7 +55,7 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
"nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | " # noqa: B950
"nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | "
@ -68,7 +68,7 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
"nvidia-cuda-nvrtc-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cuda-runtime-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cuda-cupti-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cudnn-cu12==8.9.7.29; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cublas-cu12==12.4.2.65; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cufft-cu12==11.2.0.44; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-curand-cu12==10.3.5.119; platform_system == 'Linux' and platform_machine == 'x86_64' | "
@ -347,10 +347,6 @@ def generate_wheels_matrix(
for python_version in python_versions:
for arch_version in arches:
gpu_arch_type = arch_type(arch_version)
# Disable py3.12 builds for ROCm because of triton dependency
# on llnl-hatchet, which doesn't have py3.12 wheels available
if gpu_arch_type == "rocm" and python_version == "3.12":
continue
gpu_arch_version = (
""
if arch_version == "cpu"

View File

@ -773,13 +773,13 @@ class TestBypassFailures(TestCase):
# than the one on the base commit. This should still count as broken trunk
"pr_num": 104214,
"related_failure_count": 0,
"flaky_or_broken_trunk": 1,
"unrelated_failure_count": 1,
},
{
# This PR had one broken trunk failure and it used ghstack
"pr_num": 105145,
"related_failure_count": 0,
"flaky_or_broken_trunk": 1,
"unrelated_failure_count": 1,
},
{
# The failure on the merge base was retried successfully and
@ -788,20 +788,20 @@ class TestBypassFailures(TestCase):
# be used to detect broken trunk
"pr_num": 107160,
"related_failure_count": 0,
"flaky_or_broken_trunk": 1,
"unrelated_failure_count": 4,
},
{
# This PR used Dr.CI broken trunk classification
"pr_num": 111253,
"related_failure_count": 1,
"flaky_or_broken_trunk": 1,
"unrelated_failure_count": 2,
},
]
for case in test_cases:
pr_num = case["pr_num"]
related_failure_count = case["related_failure_count"]
flaky_or_broken_trunk = case["flaky_or_broken_trunk"]
unrelated_failure_count = case["unrelated_failure_count"]
pr = GitHubPR("pytorch", "pytorch", pr_num)
checks = pr.get_checkrun_conclusions()
@ -823,7 +823,7 @@ class TestBypassFailures(TestCase):
)
self.assertTrue(len(pending) == 0)
self.assertTrue(
len(failed) == flaky_or_broken_trunk + related_failure_count
len(failed) == unrelated_failure_count + related_failure_count
)
def test_ignore_current(self, *args: Any) -> None:

View File

@ -2027,8 +2027,10 @@ def categorize_checks(
pending_checks: List[Tuple[str, Optional[str], Optional[int]]] = []
failed_checks: List[Tuple[str, Optional[str], Optional[int]]] = []
# failed_checks_categorization is used to keep track of all ignorable failures when saving the merge record on Rockset
failed_checks_categorization: Dict[str, List[Any]] = defaultdict(list)
# ok_failed_checks is used with ok_failed_checks_threshold while ignorable_failed_checks
# is used to keep track of all ignorable failures when saving the merge record on Rockset
ok_failed_checks: List[Tuple[str, Optional[str], Optional[int]]] = []
ignorable_failed_checks: Dict[str, List[Any]] = defaultdict(list)
# If required_checks is not set or empty, consider all names are relevant
relevant_checknames = [
@ -2056,38 +2058,36 @@ def categorize_checks(
continue
elif not is_passing_status(check_runs[checkname].status):
target = (
failed_checks_categorization[classification]
ignorable_failed_checks[classification]
if classification
in ("IGNORE_CURRENT_CHECK", "BROKEN_TRUNK", "FLAKY", "UNSTABLE")
else failed_checks
)
target.append((checkname, url, job_id))
flaky_or_broken_trunk = (
failed_checks_categorization["BROKEN_TRUNK"]
+ failed_checks_categorization["FLAKY"]
)
if classification in ("BROKEN_TRUNK", "FLAKY", "UNSTABLE"):
ok_failed_checks.append((checkname, url, job_id))
if flaky_or_broken_trunk:
if ok_failed_checks:
warn(
f"The following {len(flaky_or_broken_trunk)} checks failed but were likely due flakiness or broken trunk: "
+ ", ".join([x[0] for x in flaky_or_broken_trunk])
f"The following {len(ok_failed_checks)} checks failed but were likely due flakiness or broken trunk: "
+ ", ".join([x[0] for x in ok_failed_checks])
+ (
f" but this is greater than the threshold of {ok_failed_checks_threshold} so merge will fail"
if ok_failed_checks_threshold is not None
and len(flaky_or_broken_trunk) > ok_failed_checks_threshold
and len(ok_failed_checks) > ok_failed_checks_threshold
else ""
)
)
if (
ok_failed_checks_threshold is not None
and len(flaky_or_broken_trunk) > ok_failed_checks_threshold
and len(ok_failed_checks) > ok_failed_checks_threshold
):
failed_checks = failed_checks + flaky_or_broken_trunk
failed_checks = failed_checks + ok_failed_checks
# The list of failed_checks_categorization is returned so that it can be saved into the Rockset merge record
return (pending_checks, failed_checks, failed_checks_categorization)
# The list of ignorable_failed_checks is returned so that it can be saved into the Rockset merge record
return (pending_checks, failed_checks, ignorable_failed_checks)
def merge(

View File

@ -38,19 +38,19 @@ jobs:
matrix:
runner: [linux.12xlarge]
docker-image-name: [
pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9,
pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9-inductor-benchmarks,
pytorch-linux-focal-cuda12.4-cudnn9-py3.12-gcc9-inductor-benchmarks,
pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9,
pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9-inductor-benchmarks,
pytorch-linux-focal-cuda12.1-cudnn9-py3.12-gcc9-inductor-benchmarks,
pytorch-linux-focal-cuda11.8-cudnn9-py3-gcc9,
pytorch-linux-focal-cuda12.4-cudnn8-py3-gcc9,
pytorch-linux-focal-cuda12.4-cudnn8-py3-gcc9-inductor-benchmarks,
pytorch-linux-focal-cuda12.4-cudnn8-py3.12-gcc9-inductor-benchmarks,
pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9,
pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9-inductor-benchmarks,
pytorch-linux-focal-cuda12.1-cudnn8-py3.12-gcc9-inductor-benchmarks,
pytorch-linux-focal-cuda11.8-cudnn8-py3-gcc9,
pytorch-linux-focal-py3.8-clang10,
pytorch-linux-focal-py3.11-clang10,
pytorch-linux-focal-py3.12-clang10,
pytorch-linux-focal-rocm-n-1-py3,
pytorch-linux-focal-rocm-n-py3,
pytorch-linux-jammy-cuda11.8-cudnn9-py3.8-clang12,
pytorch-linux-jammy-cuda11.8-cudnn8-py3.8-clang12,
pytorch-linux-focal-py3-clang9-android-ndk-r21e,
pytorch-linux-jammy-py3.8-gcc11,
pytorch-linux-jammy-py3.8-gcc11-inductor-benchmarks,
@ -58,7 +58,7 @@ jobs:
pytorch-linux-jammy-py3-clang15-asan,
pytorch-linux-focal-py3-clang10-onnx,
pytorch-linux-focal-linter,
pytorch-linux-jammy-cuda11.8-cudnn9-py3.9-linter,
pytorch-linux-jammy-cuda11.8-cudnn8-py3.9-linter,
pytorch-linux-jammy-py3-clang12-executorch
]
include:

View File

@ -149,10 +149,3 @@ jobs:
- name: Teardown Linux
uses: pytorch/test-infra/.github/actions/teardown-linux@main
if: always()
validate:
needs: build
uses: pytorch/builder/.github/workflows/validate-docker-images.yml@main
with:
channel: nightly
ref: main

View File

@ -54,7 +54,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_8-cpu-aarch64
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_8-cpu-aarch64-test: # Testing
@ -162,7 +162,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_9-cpu-aarch64
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-cpu-aarch64-test: # Testing
@ -270,7 +270,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_10-cpu-aarch64
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_10-cpu-aarch64-test: # Testing
@ -378,7 +378,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_11-cpu-aarch64
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_11-cpu-aarch64-test: # Testing
@ -486,7 +486,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_12-cpu-aarch64
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_12-cpu-aarch64-test: # Testing

View File

@ -48,7 +48,7 @@ jobs:
DESIRED_PYTHON: "3.8"
build_name: manywheel-py3_8-cuda11_8
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==8.7.0.84; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_8-cuda11_8-test: # Testing
@ -88,7 +88,7 @@ jobs:
DESIRED_PYTHON: "3.8"
build_name: manywheel-py3_8-cuda12_1
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_8-cuda12_1-test: # Testing
@ -128,7 +128,7 @@ jobs:
DESIRED_PYTHON: "3.8"
build_name: manywheel-py3_8-cuda12_4
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.2.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.0.44; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.119; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.0.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.0.142; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.7.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.2.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.0.44; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.119; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.0.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.0.142; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_8-cuda12_4-test: # Testing

View File

@ -174,7 +174,7 @@ jobs:
DESIRED_PYTHON: "3.8"
build_name: manywheel-py3_8-cuda11_8
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==8.7.0.84; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_8-cuda11_8-test: # Testing
@ -237,7 +237,7 @@ jobs:
DESIRED_PYTHON: "3.8"
build_name: manywheel-py3_8-cuda12_1
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_8-cuda12_1-test: # Testing
@ -300,7 +300,7 @@ jobs:
DESIRED_PYTHON: "3.8"
build_name: manywheel-py3_8-cuda12_4
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.2.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.0.44; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.119; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.0.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.0.142; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.7.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.2.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.0.44; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.119; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.0.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.0.142; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_8-cuda12_4-test: # Testing
@ -690,7 +690,7 @@ jobs:
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cuda11_8
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==8.7.0.84; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-cuda11_8-test: # Testing
@ -753,7 +753,7 @@ jobs:
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cuda12_1
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-cuda12_1-test: # Testing
@ -816,7 +816,7 @@ jobs:
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cuda12_4
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.2.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.0.44; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.119; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.0.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.0.142; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.7.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.2.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.0.44; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.119; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.0.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.0.142; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-cuda12_4-test: # Testing
@ -1206,7 +1206,7 @@ jobs:
DESIRED_PYTHON: "3.10"
build_name: manywheel-py3_10-cuda11_8
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==8.7.0.84; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_10-cuda11_8-test: # Testing
@ -1269,7 +1269,7 @@ jobs:
DESIRED_PYTHON: "3.10"
build_name: manywheel-py3_10-cuda12_1
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_10-cuda12_1-test: # Testing
@ -1332,7 +1332,7 @@ jobs:
DESIRED_PYTHON: "3.10"
build_name: manywheel-py3_10-cuda12_4
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.2.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.0.44; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.119; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.0.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.0.142; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.7.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.2.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.0.44; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.119; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.0.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.0.142; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_10-cuda12_4-test: # Testing
@ -1722,7 +1722,7 @@ jobs:
DESIRED_PYTHON: "3.11"
build_name: manywheel-py3_11-cuda11_8
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==8.7.0.84; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_11-cuda11_8-test: # Testing
@ -1785,7 +1785,7 @@ jobs:
DESIRED_PYTHON: "3.11"
build_name: manywheel-py3_11-cuda12_1
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_11-cuda12_1-test: # Testing
@ -1848,7 +1848,7 @@ jobs:
DESIRED_PYTHON: "3.11"
build_name: manywheel-py3_11-cuda12_4
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.2.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.0.44; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.119; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.0.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.0.142; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.7.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.2.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.0.44; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.119; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.0.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.0.142; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_11-cuda12_4-test: # Testing
@ -2238,7 +2238,7 @@ jobs:
DESIRED_PYTHON: "3.12"
build_name: manywheel-py3_12-cuda11_8
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==8.7.0.84; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_12-cuda11_8-test: # Testing
@ -2301,7 +2301,7 @@ jobs:
DESIRED_PYTHON: "3.12"
build_name: manywheel-py3_12-cuda12_1
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_12-cuda12_1-test: # Testing
@ -2364,7 +2364,7 @@ jobs:
DESIRED_PYTHON: "3.12"
build_name: manywheel-py3_12-cuda12_4
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.2.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.0.44; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.119; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.0.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.0.142; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.7.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.2.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.0.44; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.119; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.0.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.0.142; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_12-cuda12_4-test: # Testing
@ -2410,3 +2410,209 @@ jobs:
conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_12-rocm6_0-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: rocm6.0
GPU_ARCH_VERSION: 6.0
GPU_ARCH_TYPE: rocm
DOCKER_IMAGE: pytorch/manylinux-builder:rocm6.0-main
DESIRED_PYTHON: "3.12"
build_name: manywheel-py3_12-rocm6_0
build_environment: linux-binary-manywheel
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_12-rocm6_0-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs: manywheel-py3_12-rocm6_0-build
runs-on: linux.rocm.gpu
timeout-minutes: 240
env:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: rocm6.0
GPU_ARCH_VERSION: 6.0
GPU_ARCH_TYPE: rocm
SKIP_ALL_TESTS: 1
DOCKER_IMAGE: pytorch/manylinux-builder:rocm6.0-main
DESIRED_PYTHON: "3.12"
steps:
- name: Setup ROCm
uses: ./.github/actions/setup-rocm
- uses: actions/download-artifact@v3
name: Download Build Artifacts
with:
name: manywheel-py3_12-rocm6_0
path: "${{ runner.temp }}/artifacts/"
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
quiet-checkout: true
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: ROCm set GPU_FLAG
run: |
echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}"
- name: Pull Docker image
uses: pytorch/test-infra/.github/actions/pull-docker-image@main
with:
docker-image: pytorch/manylinux-builder:rocm6.0-main
- name: Test Pytorch binary
uses: ./pytorch/.github/actions/test-pytorch-binary
- name: Teardown ROCm
uses: ./.github/actions/teardown-rocm
manywheel-py3_12-rocm6_0-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_12-rocm6_0-test
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: rocm6.0
GPU_ARCH_VERSION: 6.0
GPU_ARCH_TYPE: rocm
DOCKER_IMAGE: pytorch/manylinux-builder:rocm6.0-main
DESIRED_PYTHON: "3.12"
build_name: manywheel-py3_12-rocm6_0
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_12-rocm6_1-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: rocm6.1
GPU_ARCH_VERSION: 6.1
GPU_ARCH_TYPE: rocm
DOCKER_IMAGE: pytorch/manylinux-builder:rocm6.1-main
DESIRED_PYTHON: "3.12"
build_name: manywheel-py3_12-rocm6_1
build_environment: linux-binary-manywheel
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_12-rocm6_1-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs: manywheel-py3_12-rocm6_1-build
runs-on: linux.rocm.gpu
timeout-minutes: 240
env:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: rocm6.1
GPU_ARCH_VERSION: 6.1
GPU_ARCH_TYPE: rocm
SKIP_ALL_TESTS: 1
DOCKER_IMAGE: pytorch/manylinux-builder:rocm6.1-main
DESIRED_PYTHON: "3.12"
steps:
- name: Setup ROCm
uses: ./.github/actions/setup-rocm
- uses: actions/download-artifact@v3
name: Download Build Artifacts
with:
name: manywheel-py3_12-rocm6_1
path: "${{ runner.temp }}/artifacts/"
- name: Checkout PyTorch
uses: malfet/checkout@silent-checkout
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
quiet-checkout: true
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Checkout pytorch/builder
uses: malfet/checkout@silent-checkout
with:
ref: main
submodules: recursive
repository: pytorch/builder
path: builder
quiet-checkout: true
- name: Clean pytorch/builder checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: builder
- name: ROCm set GPU_FLAG
run: |
echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}"
- name: Pull Docker image
uses: pytorch/test-infra/.github/actions/pull-docker-image@main
with:
docker-image: pytorch/manylinux-builder:rocm6.1-main
- name: Test Pytorch binary
uses: ./pytorch/.github/actions/test-pytorch-binary
- name: Teardown ROCm
uses: ./.github/actions/teardown-rocm
manywheel-py3_12-rocm6_1-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_12-rocm6_1-test
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: rocm6.1
GPU_ARCH_VERSION: 6.1
GPU_ARCH_TYPE: rocm
DOCKER_IMAGE: pytorch/manylinux-builder:rocm6.1-main
DESIRED_PYTHON: "3.12"
build_name: manywheel-py3_12-rocm6_1
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
uses: ./.github/workflows/_binary-upload.yml

View File

@ -54,7 +54,7 @@ jobs:
ALPINE_IMAGE: "docker.io/s390x/alpine"
build_name: manywheel-py3_8-cpu-s390x
build_environment: linux-s390x-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_8-cpu-s390x-test: # Testing
@ -117,7 +117,7 @@ jobs:
ALPINE_IMAGE: "docker.io/s390x/alpine"
build_name: manywheel-py3_9-cpu-s390x
build_environment: linux-s390x-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-cpu-s390x-test: # Testing
@ -180,7 +180,7 @@ jobs:
ALPINE_IMAGE: "docker.io/s390x/alpine"
build_name: manywheel-py3_10-cpu-s390x
build_environment: linux-s390x-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_10-cpu-s390x-test: # Testing
@ -243,7 +243,7 @@ jobs:
ALPINE_IMAGE: "docker.io/s390x/alpine"
build_name: manywheel-py3_11-cpu-s390x
build_environment: linux-s390x-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_11-cpu-s390x-test: # Testing
@ -306,7 +306,7 @@ jobs:
ALPINE_IMAGE: "docker.io/s390x/alpine"
build_name: manywheel-py3_12-cpu-s390x
build_environment: linux-s390x-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_12-cpu-s390x-test: # Testing

View File

@ -46,7 +46,7 @@ jobs:
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.8"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
# For sccache access (only on non-forked PRs)
AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }}
@ -165,7 +165,7 @@ jobs:
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.9"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
# For sccache access (only on non-forked PRs)
AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }}
@ -284,7 +284,7 @@ jobs:
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.10"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
# For sccache access (only on non-forked PRs)
AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }}
@ -403,7 +403,7 @@ jobs:
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.11"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
# For sccache access (only on non-forked PRs)
AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }}
@ -522,7 +522,7 @@ jobs:
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.12"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
# For sccache access (only on non-forked PRs)
AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }}

View File

@ -46,7 +46,7 @@ jobs:
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.8"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash
@ -290,7 +290,7 @@ jobs:
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.8"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash
@ -536,7 +536,7 @@ jobs:
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.8"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash
@ -782,7 +782,7 @@ jobs:
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.8"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash
@ -1027,7 +1027,7 @@ jobs:
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.9"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash
@ -1271,7 +1271,7 @@ jobs:
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.9"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash
@ -1517,7 +1517,7 @@ jobs:
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.9"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash
@ -1763,7 +1763,7 @@ jobs:
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.9"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash
@ -2008,7 +2008,7 @@ jobs:
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.10"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash
@ -2252,7 +2252,7 @@ jobs:
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.10"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash
@ -2498,7 +2498,7 @@ jobs:
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.10"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash
@ -2744,7 +2744,7 @@ jobs:
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.10"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash
@ -2989,7 +2989,7 @@ jobs:
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.11"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash
@ -3233,7 +3233,7 @@ jobs:
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.11"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash
@ -3479,7 +3479,7 @@ jobs:
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.11"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash
@ -3725,7 +3725,7 @@ jobs:
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.11"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash
@ -3970,7 +3970,7 @@ jobs:
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.12"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash
@ -4214,7 +4214,7 @@ jobs:
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.12"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash
@ -4460,7 +4460,7 @@ jobs:
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.12"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash
@ -4706,7 +4706,7 @@ jobs:
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.12"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash

View File

@ -1,108 +0,0 @@
name: inductor-cu124
on:
push:
tags:
- ciflow/inductor-cu124/*
workflow_dispatch:
schedule:
# Run every 4 hours during the week and every 12 hours on the weekend
- cron: 45 0,4,8,12,16,20 * * 1-5
- cron: 45 4,12 * * 0,6
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
cancel-in-progress: true
permissions: read-all
jobs:
linux-focal-cuda12_4-py3_10-gcc9-inductor-build:
# Should be synced with the one in inductor.yml, but this doesn't run inductor_timm
name: cuda12.4-py3.10-gcc9-sm86
uses: ./.github/workflows/_linux-build.yml
with:
sync-tag: linux-focal-cuda12_4-py3_10-gcc9-inductor-build
build-environment: linux-focal-cuda12.4-py3.10-gcc9-sm86
docker-image-name: pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.6'
test-matrix: |
{ include: [
{ config: "inductor", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "inductor_distributed", shard: 1, num_shards: 1, runner: "linux.g5.12xlarge.nvidia.gpu" },
{ config: "inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "inductor_cpp_wrapper_abi_compatible", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
]}
secrets:
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
linux-focal-cuda12_4-py3_10-gcc9-inductor-test:
name: cuda12.4-py3.10-gcc9-sm86
uses: ./.github/workflows/_linux-test.yml
needs: linux-focal-cuda12_4-py3_10-gcc9-inductor-build
with:
sync-tag: linux-focal-cuda12_4-py3_10-gcc9-inductor-test
build-environment: linux-focal-cuda12.4-py3.10-gcc9-sm86
docker-image: ${{ needs.linux-focal-cuda12_4-py3_10-gcc9-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-focal-cuda12_4-py3_10-gcc9-inductor-build.outputs.test-matrix }}
secrets:
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
linux-focal-cuda12_4-py3_10-gcc9-inductor-build-gcp:
name: cuda12.4-py3.10-gcc9-sm80
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-focal-cuda12.4-py3.10-gcc9-sm80
docker-image-name: pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.0'
test-matrix: |
{ include: [
{ config: "inductor_torchbench_smoketest_perf", shard: 1, num_shards: 1, runner: "linux.gcp.a100" },
]}
secrets:
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
linux-focal-cuda12_4-py3_10-gcc9-inductor-test-gcp:
name: cuda12.4-py3.10-gcc9-sm80
uses: ./.github/workflows/_linux-test.yml
needs: linux-focal-cuda12_4-py3_10-gcc9-inductor-build-gcp
with:
build-environment: linux-focal-cuda12.4-py3.10-gcc9-sm80
docker-image: ${{ needs.linux-focal-cuda12_4-py3_10-gcc9-inductor-build-gcp.outputs.docker-image }}
test-matrix: ${{ needs.linux-focal-cuda12_4-py3_10-gcc9-inductor-build-gcp.outputs.test-matrix }}
use-gha: anything-non-empty-to-use-gha
secrets:
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
linux-focal-cuda12_4-py3_12-gcc9-inductor-build:
name: cuda12.4-py3.12-gcc9-sm86
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-focal-cuda12.4-py3.12-gcc9-sm86
docker-image-name: pytorch-linux-focal-cuda12.4-cudnn9-py3.12-gcc9-inductor-benchmarks
cuda-arch-list: '8.6'
test-matrix: |
{ include: [
{ config: "inductor", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
]}
linux-focal-cuda12_4-py3_12-gcc9-inductor-test:
name: cuda12.4-py3.12-gcc9-sm86
uses: ./.github/workflows/_linux-test.yml
needs: linux-focal-cuda12_4-py3_12-gcc9-inductor-build
with:
build-environment: linux-focal-cuda12.4-py3.12-gcc9-sm86
docker-image: ${{ needs.linux-focal-cuda12_4-py3_12-gcc9-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-focal-cuda12_4-py3_12-gcc9-inductor-build.outputs.test-matrix }}

View File

@ -21,7 +21,7 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-focal-cuda12.1-py3.10-gcc9-sm80
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9-inductor-benchmarks
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.0'
test-matrix: |
{ include: [

View File

@ -18,7 +18,7 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-focal-cuda12.1-py3.10-gcc9-sm80
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9-inductor-benchmarks
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.0'
test-matrix: |
{ include: [

View File

@ -71,7 +71,7 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-focal-cuda12.1-py3.10-gcc9-sm80
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9-inductor-benchmarks
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.0'
test-matrix: |
{ include: [

View File

@ -23,7 +23,7 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-focal-cuda12.1-py3.10-gcc9-sm86
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9-inductor-benchmarks
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.6'
test-matrix: |
{ include: [
@ -56,3 +56,93 @@ jobs:
test-matrix: ${{ needs.linux-focal-cuda12_1-py3_10-gcc9-periodic-dynamo-benchmarks-build.outputs.test-matrix }}
secrets:
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
linux-focal-cuda12_4-py3_10-gcc9-inductor-build:
# Should be synced with the one in inductor.yml, but this doesn't run inductor_timm
name: cuda12.4-py3.10-gcc9-sm86
uses: ./.github/workflows/_linux-build.yml
with:
sync-tag: linux-focal-cuda12_4-py3_10-gcc9-inductor-build
build-environment: linux-focal-cuda12.4-py3.10-gcc9-sm86
docker-image-name: pytorch-linux-focal-cuda12.4-cudnn8-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.6'
test-matrix: |
{ include: [
{ config: "inductor", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "inductor_distributed", shard: 1, num_shards: 1, runner: "linux.g5.12xlarge.nvidia.gpu" },
{ config: "inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "dynamic_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "aot_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "inductor_cpp_wrapper_abi_compatible", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
]}
secrets:
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
linux-focal-cuda12_4-py3_10-gcc9-inductor-test:
name: cuda12.4-py3.10-gcc9-sm86
uses: ./.github/workflows/_linux-test.yml
needs: linux-focal-cuda12_4-py3_10-gcc9-inductor-build
with:
sync-tag: linux-focal-cuda12_4-py3_10-gcc9-inductor-test
build-environment: linux-focal-cuda12.4-py3.10-gcc9-sm86
docker-image: ${{ needs.linux-focal-cuda12_4-py3_10-gcc9-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-focal-cuda12_4-py3_10-gcc9-inductor-build.outputs.test-matrix }}
secrets:
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
linux-focal-cuda12_4-py3_10-gcc9-inductor-build-gcp:
name: cuda12.4-py3.10-gcc9-sm80
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-focal-cuda12.4-py3.10-gcc9-sm80
docker-image-name: pytorch-linux-focal-cuda12.4-cudnn8-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.0'
test-matrix: |
{ include: [
{ config: "inductor_torchbench_smoketest_perf", shard: 1, num_shards: 1, runner: "linux.gcp.a100" },
]}
secrets:
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
linux-focal-cuda12_4-py3_10-gcc9-inductor-test-gcp:
name: cuda12.4-py3.10-gcc9-sm80
uses: ./.github/workflows/_linux-test.yml
needs: linux-focal-cuda12_4-py3_10-gcc9-inductor-build-gcp
with:
build-environment: linux-focal-cuda12.4-py3.10-gcc9-sm80
docker-image: ${{ needs.linux-focal-cuda12_4-py3_10-gcc9-inductor-build-gcp.outputs.docker-image }}
test-matrix: ${{ needs.linux-focal-cuda12_4-py3_10-gcc9-inductor-build-gcp.outputs.test-matrix }}
use-gha: anything-non-empty-to-use-gha
secrets:
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
linux-focal-cuda12_4-py3_12-gcc9-inductor-build:
name: cuda12.4-py3.12-gcc9-sm86
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-focal-cuda12.4-py3.12-gcc9-sm86
docker-image-name: pytorch-linux-focal-cuda12.4-cudnn8-py3.12-gcc9-inductor-benchmarks
cuda-arch-list: '8.6'
test-matrix: |
{ include: [
{ config: "inductor", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
]}
linux-focal-cuda12_4-py3_12-gcc9-inductor-test:
name: cuda12.4-py3.12-gcc9-sm86
uses: ./.github/workflows/_linux-test.yml
needs: linux-focal-cuda12_4-py3_12-gcc9-inductor-build
with:
build-environment: linux-focal-cuda12.4-py3.12-gcc9-sm86
docker-image: ${{ needs.linux-focal-cuda12_4-py3_12-gcc9-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-focal-cuda12_4-py3_12-gcc9-inductor-build.outputs.test-matrix }}

View File

@ -44,7 +44,7 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-focal-cuda12.1-py3.10-gcc9-sm86
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9-inductor-benchmarks
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.6'
test-matrix: |
{ include: [
@ -86,7 +86,7 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-focal-cuda12.1-py3.10-gcc9-sm80
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9-inductor-benchmarks
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.0'
test-matrix: |
{ include: [
@ -112,7 +112,7 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-focal-cuda12.1-py3.12-gcc9-sm86
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3.12-gcc9-inductor-benchmarks
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn8-py3.12-gcc9-inductor-benchmarks
cuda-arch-list: '8.6'
test-matrix: |
{ include: [
@ -135,7 +135,7 @@ jobs:
with:
sync-tag: linux-focal-cuda12_4-py3_10-gcc9-inductor-build
build-environment: linux-focal-cuda12.4-py3.10-gcc9-sm86
docker-image-name: pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9-inductor-benchmarks
docker-image-name: pytorch-linux-focal-cuda12.4-cudnn8-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.6'
test-matrix: |
{ include: [

View File

@ -20,7 +20,7 @@ jobs:
with:
timeout: 120
runner: linux.2xlarge
docker-image: pytorch-linux-jammy-cuda11.8-cudnn9-py3.9-linter
docker-image: pytorch-linux-jammy-cuda11.8-cudnn8-py3.9-linter
# NB: A shallow checkout won't work here because calculate-docker-image requires a full checkout
# to run git rev-parse HEAD~:.ci/docker when a new image is needed
fetch-depth: 0
@ -36,7 +36,7 @@ jobs:
with:
timeout: 120
runner: linux.2xlarge
docker-image: pytorch-linux-jammy-cuda11.8-cudnn9-py3.9-linter
docker-image: pytorch-linux-jammy-cuda11.8-cudnn8-py3.9-linter
# NB: A shallow checkout won't work here because calculate-docker-image requires a full checkout
# to run git rev-parse HEAD~:.ci/docker when a new image is needed
fetch-depth: 0

View File

@ -42,7 +42,7 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-focal-cuda12.1-py3.10-gcc9
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9
test-matrix: |
{ include: [
{ config: "nogpu_AVX512", shard: 1, num_shards: 1, runner: "linux.2xlarge" },
@ -65,7 +65,7 @@ jobs:
uses: ./.github/workflows/_linux-build-label.yml
with:
build-environment: linux-focal-cuda12.4-py3.10-gcc9
docker-image-name: pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9
docker-image-name: pytorch-linux-focal-cuda12.4-cudnn8-py3-gcc9
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 5, runner: "linux.4xlarge.nvidia.gpu" },
@ -120,7 +120,7 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-focal-cuda11.8-py3.9-gcc9
docker-image-name: pytorch-linux-focal-cuda11.8-cudnn9-py3-gcc9
docker-image-name: pytorch-linux-focal-cuda11.8-cudnn8-py3-gcc9
cuda-arch-list: 8.6
test-matrix: |
{ include: [
@ -142,7 +142,7 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-focal-cuda11.8-py3.10-gcc9-debug
docker-image-name: pytorch-linux-focal-cuda11.8-cudnn9-py3-gcc9
docker-image-name: pytorch-linux-focal-cuda11.8-cudnn8-py3-gcc9
build-with-debug: true
test-matrix: |
{ include: [

View File

@ -237,7 +237,7 @@ jobs:
uses: ./.github/workflows/_linux-build-label.yml
with:
build-environment: linux-focal-cuda11.8-py3.10-gcc9
docker-image-name: pytorch-linux-focal-cuda11.8-cudnn9-py3-gcc9
docker-image-name: pytorch-linux-focal-cuda11.8-cudnn8-py3-gcc9
test-matrix: |
{ include: [
{ config: "distributed", shard: 1, num_shards: 3, runner: "linux.8xlarge.nvidia.gpu" },
@ -262,7 +262,7 @@ jobs:
uses: ./.github/workflows/_linux-build-label.yml
with:
build-environment: linux-focal-cuda12.1-py3.10-gcc9
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 5, runner: "linux.4xlarge.nvidia.gpu" },
@ -297,12 +297,12 @@ jobs:
{ config: "default", shard: 1, num_shards: 1 },
]}
linux-jammy-cuda-11_8-cudnn9-py3_8-clang12-build:
name: linux-jammy-cuda11.8-cudnn9-py3.8-clang12
linux-jammy-cuda-11_8-cudnn8-py3_8-clang12-build:
name: linux-jammy-cuda11.8-cudnn8-py3.8-clang12
uses: ./.github/workflows/_linux-build-label.yml
with:
build-environment: linux-jammy-cuda11.8-cudnn9-py3.8-clang12
docker-image-name: pytorch-linux-jammy-cuda11.8-cudnn9-py3.8-clang12
build-environment: linux-jammy-cuda11.8-cudnn8-py3.8-clang12
docker-image-name: pytorch-linux-jammy-cuda11.8-cudnn8-py3.8-clang12
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 1 },
@ -361,7 +361,7 @@ jobs:
uses: ./.github/workflows/_bazel-build-test.yml
with:
build-environment: linux-focal-cuda12.1-py3.10-gcc9-bazel-test
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9
cuda-version: cpu
test-matrix: |
{ include: [
@ -373,7 +373,7 @@ jobs:
uses: ./.github/workflows/_bazel-build-test.yml
with:
build-environment: linux-focal-cuda12.1-py3.10-gcc9-bazel-test
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9
cuda-version: "12.1"
test-matrix: |
{ include: [
@ -385,7 +385,7 @@ jobs:
uses: ./.github/workflows/_bazel-build-test.yml
with:
build-environment: linux-focal-cuda12.4-py3.10-gcc9-bazel-test
docker-image-name: pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9
docker-image-name: pytorch-linux-focal-cuda12.4-cudnn8-py3-gcc9
cuda-version: "12.4"
test-matrix: |
{ include: [
@ -447,7 +447,7 @@ jobs:
uses: ./.github/workflows/_linux-build-label.yml
with:
build-environment: linux-focal-cuda12.1-py3.10-gcc9-sm86
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9
cuda-arch-list: 8.6
test-matrix: |
{ include: [

View File

@ -41,7 +41,7 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-focal-cuda12.1-py3-gcc9-slow-gradcheck
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9
cuda-arch-list: 8.6
test-matrix: |
{ include: [
@ -70,7 +70,7 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-focal-cuda12.1-py3.10-gcc9-sm86
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9
cuda-arch-list: 8.6
test-matrix: |
{ include: [

View File

@ -26,7 +26,7 @@ jobs:
id: calculate-docker-image
uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
with:
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9
working-directory: pytorch
- name: Use following to pull public copy of the image

View File

@ -16,7 +16,7 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-focal-cuda12.1-py3.10-gcc9-sm80
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9-inductor-benchmarks
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.0'
test-matrix: |
{ include: [

View File

@ -39,7 +39,7 @@ jobs:
uses: ./.github/workflows/_linux-build-label.yml
with:
build-environment: linux-focal-cuda12.4-py3.10-gcc9-sm86
docker-image-name: pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9
docker-image-name: pytorch-linux-focal-cuda12.4-cudnn8-py3-gcc9
cuda-arch-list: 8.6
test-matrix: |
{ include: [
@ -66,7 +66,7 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: libtorch-linux-focal-cuda12.1-py3.7-gcc9
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9
build-generates-artifacts: false
runner: linux.4xlarge
test-matrix: |
@ -80,7 +80,7 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-focal-cuda12.1-py3.10-gcc9-no-ops
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 1 },
@ -91,7 +91,7 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: libtorch-linux-focal-cuda12.4-py3.7-gcc9
docker-image-name: pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9
docker-image-name: pytorch-linux-focal-cuda12.4-cudnn8-py3-gcc9
build-generates-artifacts: false
runner: linux.4xlarge
test-matrix: |
@ -105,7 +105,7 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-focal-cuda12.4-py3.10-gcc9-no-ops
docker-image-name: pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9
docker-image-name: pytorch-linux-focal-cuda12.4-cudnn8-py3-gcc9
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 1 },

View File

@ -1532,6 +1532,28 @@ exclude_patterns = [
'torch/distributed/optim/post_localSGD_optimizer.py',
'torch/distributed/optim/utils.py',
'torch/distributed/optim/zero_redundancy_optimizer.py',
'torch/distributed/pipeline/__init__.py',
'torch/distributed/pipeline/sync/__init__.py',
'torch/distributed/pipeline/sync/_balance/__init__.py',
'torch/distributed/pipeline/sync/_balance/blockpartition.py',
'torch/distributed/pipeline/sync/_balance/profile.py',
'torch/distributed/pipeline/sync/batchnorm.py',
'torch/distributed/pipeline/sync/checkpoint.py',
'torch/distributed/pipeline/sync/copy.py',
'torch/distributed/pipeline/sync/dependency.py',
'torch/distributed/pipeline/sync/microbatch.py',
'torch/distributed/pipeline/sync/phony.py',
'torch/distributed/pipeline/sync/pipe.py',
'torch/distributed/pipeline/sync/pipeline.py',
'torch/distributed/pipeline/sync/skip/__init__.py',
'torch/distributed/pipeline/sync/skip/layout.py',
'torch/distributed/pipeline/sync/skip/namespace.py',
'torch/distributed/pipeline/sync/skip/portal.py',
'torch/distributed/pipeline/sync/skip/skippable.py',
'torch/distributed/pipeline/sync/skip/tracker.py',
'torch/distributed/pipeline/sync/stream.py',
'torch/distributed/pipeline/sync/utils.py',
'torch/distributed/pipeline/sync/worker.py',
'torch/distributed/remote_device.py',
'torch/distributed/rendezvous.py',
'torch/distributed/rpc/__init__.py',
@ -1825,6 +1847,8 @@ exclude_patterns = [
'torch/testing/_internal/distributed/nn/__init__.py',
'torch/testing/_internal/distributed/nn/api/__init__.py',
'torch/testing/_internal/distributed/nn/api/remote_module_test.py',
'torch/testing/_internal/distributed/pipe_with_ddp_test.py',
'torch/testing/_internal/distributed/pipeline/__init__.py',
'torch/testing/_internal/distributed/rpc/__init__.py',
'torch/testing/_internal/distributed/rpc/dist_autograd_test.py',
'torch/testing/_internal/distributed/rpc/dist_optimizer_test.py',
@ -2079,7 +2103,7 @@ init_command = [
'python3',
'tools/linter/adapters/pip_init.py',
'--dry-run={{DRYRUN}}',
'ruff==0.4.8',
'ruff==0.4.6',
]
is_formatter = true

View File

@ -213,7 +213,6 @@ conda install -c pytorch magma-cuda121 # or the magma-cuda* that matches your C
# (optional) If using torch.compile with inductor/triton, install the matching version of triton
# Run from the pytorch directory after cloning
# For Intel GPU support, please explicitly `export USE_XPU=1` before running command.
make triton
```

View File

@ -473,6 +473,7 @@ endif()
if(USE_CUDA AND NOT USE_ROCM)
list(APPEND ATen_CUDA_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/cutlass/include)
list(APPEND ATen_CUDA_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/cutlass/tools/util/include)
if($ENV{ATEN_STATIC_CUDA})
list(APPEND ATen_CUDA_DEPENDENCY_LIBS
${CUDA_LIBRARIES}

View File

@ -68,7 +68,7 @@ thread_local std::array<at::ScalarType, at::COMPILE_TIME_MAX_DEVICE_TYPES>
at::kBFloat16, // XLA / TPU
at::ScalarType::Undefined, // Vulkan
at::ScalarType::Undefined, // Metal
at::kHalf, // XPU
at::kBFloat16, // XPU
at::ScalarType::Undefined, // MPS
at::ScalarType::Undefined, // Meta (tensors with no data)
at::kBFloat16, // HPU / HABANA

View File

@ -4,21 +4,6 @@
#endif
namespace at::cpu {
bool is_cpu_support_avx2() {
#if !defined(__s390x__) && !defined(__powerpc__)
return cpuinfo_initialize() && cpuinfo_has_x86_avx2();
#else
return false;
#endif
}
bool is_cpu_support_avx512() {
#if !defined(__s390x__) && !defined(__powerpc__)
return cpuinfo_initialize() && cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512vl() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq();
#else
return false;
#endif
}
bool is_cpu_support_vnni() {
#if !defined(__s390x__) && !defined(__powerpc__)

View File

@ -4,9 +4,6 @@
namespace at::cpu {
TORCH_API bool is_cpu_support_avx2();
TORCH_API bool is_cpu_support_avx512();
// Detect if CPU support Vector Neural Network Instruction.
TORCH_API bool is_cpu_support_vnni();

View File

@ -170,6 +170,43 @@ CUDA_STUB3(cuLinkComplete, CUlinkState, void **, size_t *);
CUDA_STUB3(cuFuncSetAttribute, CUfunction, CUfunction_attribute, int);
CUDA_STUB3(cuFuncGetAttribute, int*, CUfunction_attribute, CUfunction);
#if defined(CUDA_VERSION) && CUDA_VERSION >= 12000
CUresult CUDAAPI
cuTensorMapEncodeTiled(
CUtensorMap* tensorMap,
CUtensorMapDataType tensorDataType,
cuuint32_t tensorRank,
void* globalAddress,
const cuuint64_t* globalDim,
const cuuint64_t* globalStrides,
const cuuint32_t* boxDim,
const cuuint32_t* elementStrides,
CUtensorMapInterleave interleave,
CUtensorMapSwizzle swizzle,
CUtensorMapL2promotion l2Promotion,
CUtensorMapFloatOOBfill oobFill) {
auto fn = reinterpret_cast<decltype(&cuTensorMapEncodeTiled)>(
getCUDALibrary().sym(__func__));
if (!fn)
throw std::runtime_error("Can't get cuTensorMapEncodeTiled");
lazyNVRTC.cuTensorMapEncodeTiled = fn;
return fn(
tensorMap,
tensorDataType,
tensorRank,
globalAddress,
globalDim,
globalStrides,
boxDim,
elementStrides,
interleave,
swizzle,
l2Promotion,
oobFill);
}
#endif
// Irregularly shaped functions
CUresult CUDAAPI cuLaunchKernel(CUfunction f,
unsigned int gridDimX,

View File

@ -34,8 +34,8 @@ struct PhiloxCudaState {
int64_t* ptr;
};
Payload seed_{};
Payload offset_{};
Payload seed_;
Payload offset_;
uint32_t offset_intragraph_ = 0;
bool captured_ = false;
};

View File

@ -59,16 +59,25 @@ namespace at { namespace cuda {
_(cuLinkAddData) \
_(cuLinkComplete) \
_(cuFuncSetAttribute) \
_(cuFuncGetAttribute)
_(cuFuncGetAttribute) \
#if defined(CUDA_VERSION) && CUDA_VERSION >= 12000
#define AT_FORALL_NVRTC_EXTENDED(_) \
AT_FORALL_NVRTC_BASE(_) \
_(cuTensorMapEncodeTiled)
#else
#define AT_FORALL_NVRTC_EXTENDED(_) \
AT_FORALL_NVRTC_BASE(_)
#endif
#if defined(CUDA_VERSION) && CUDA_VERSION >= 11010
#define AT_FORALL_NVRTC(_) \
AT_FORALL_NVRTC_BASE(_) \
AT_FORALL_NVRTC_EXTENDED(_) \
_(nvrtcGetCUBINSize) \
_(nvrtcGetCUBIN)
#else
#define AT_FORALL_NVRTC(_) \
AT_FORALL_NVRTC_BASE(_)
AT_FORALL_NVRTC_EXTENDED(_)
#endif
#else

View File

@ -31,6 +31,46 @@ Tensor index_select_backward_hack(const Tensor& grad, IntArrayRef self_sizes, in
return at::zeros(self_sizes, grad.options()).index_add(dim, index, grad);
}
static optional<std::tuple<Tensor,int64_t>> unwrap(const Tensor& tensor) {
auto* wrapped = maybeGetTensorWrapper(tensor);
if (wrapped) {
if (wrapped->level().has_value()) {
return std::make_tuple(wrapped->value(), *wrapped->level());
}
return unwrap(wrapped->value());
}
auto* batched = maybeGetBatchedImpl(tensor);
if (batched) {
return std::make_tuple(batched->value(), batched->level());
}
return nullopt;
}
static bool can_perform_inplace(const Tensor& a, const Tensor& b) {
// TODO: generalize this to more transforms
auto a_ = unwrap(a);
auto b_ = unwrap(b);
if (!a_.has_value() && b_.has_value()) {
return false;
}
if (!a_.has_value() && !b_.has_value()) {
return true;
}
if (a_.has_value() && !b_.has_value()) {
return true;
}
TORCH_INTERNAL_ASSERT(a_.has_value() && b_.has_value());
// If b has any wrapper that a does not, then we cannot do a.inplace_(b)
if (std::get<1>(*a_) < std::get<1>(*b_)) {
return false;
}
if (std::get<1>(*a_) > std::get<1>(*b_)) {
return can_perform_inplace(std::get<0>(*a_), b);
}
return can_perform_inplace(std::get<0>(*a_), std::get<0>(*b_));
}
// TODO: linear is pretty important for performance, but I'm not sure how to work
// around the in-place.
Tensor linear_hack(const Tensor& input, const Tensor& weight, const std::optional<Tensor>& bias_opt) {

View File

@ -1480,14 +1480,23 @@ Tensor& not_equal_(Tensor& self, const Scalar& other) { return self.ne_(other);
Tensor& logical_and_out(const Tensor& self, const Tensor& other, Tensor& result) { return comparison_op_out(result, self, other, logical_and_stub); }
Tensor logical_and(const Tensor& self, const Tensor& other) { return comparison_op(self, other, static_cast<OutFunc>(at::logical_and_out)); }
Tensor& logical_and_(Tensor& self, const Tensor& other) { return comparison_op_(self, other, static_cast<OutFunc>(at::logical_and_out)); }
static Tensor& logical_and_out(Tensor& result, const Tensor& self, const Scalar& other) { return comparison_op_out(result, self, other, static_cast<OutFunc>(at::logical_and_out)); }
static Tensor logical_and(const Tensor& self, const Scalar& other) { return comparison_op(self, other, static_cast<OutFunc>(at::logical_and_out)); }
static Tensor& logical_and_(Tensor& self, const Scalar& other) { return comparison_op_(self, other, static_cast<OutFunc>(at::logical_and_out)); }
Tensor& logical_or_out(const Tensor& self, const Tensor& other, Tensor& result) { return comparison_op_out(result, self, other, logical_or_stub); }
Tensor logical_or(const Tensor& self, const Tensor& other) { return comparison_op(self, other, static_cast<OutFunc>(at::logical_or_out)); }
Tensor& logical_or_(Tensor& self, const Tensor& other) { return comparison_op_(self, other, static_cast<OutFunc>(at::logical_or_out)); }
static Tensor& logical_or_out(Tensor& result, const Tensor& self, const Scalar& other) { return comparison_op_out(result, self, other, static_cast<OutFunc>(at::logical_or_out)); }
static Tensor logical_or(const Tensor& self, const Scalar& other) { return comparison_op(self, other, static_cast<OutFunc>(at::logical_or_out)); }
static Tensor& logical_or_(Tensor& self, const Scalar& other) { return comparison_op_(self, other, static_cast<OutFunc>(at::logical_or_out)); }
Tensor& logical_xor_out(const Tensor& self, const Tensor& other, Tensor& result) { return comparison_op_out(result, self, other, logical_xor_stub); }
Tensor logical_xor(const Tensor& self, const Tensor& other) { return comparison_op(self, other, static_cast<OutFunc>(at::logical_xor_out)); }
Tensor& logical_xor_(Tensor& self, const Tensor& other) { return comparison_op_(self, other, static_cast<OutFunc>(at::logical_xor_out)); }
static Tensor& logical_xor_out(Tensor& result, const Tensor& self, const Scalar& other) { return comparison_op_out(result, self, other, static_cast<OutFunc>(at::logical_xor_out)); }
static Tensor logical_xor(const Tensor& self, const Scalar& other) { return comparison_op(self, other, static_cast<OutFunc>(at::logical_xor_out)); }
static Tensor& logical_xor_(Tensor& self, const Scalar& other) { return comparison_op_(self, other, static_cast<OutFunc>(at::logical_xor_out)); }
// binary max, alias for maximum
Tensor& max_out(const Tensor& self, const Tensor& other, Tensor& result) {

View File

@ -393,7 +393,7 @@ struct RegisterPRIVATEUSE1Dispatch {
// REGISTER_DISPATCH now dispatches an AVX512 kernel to nullptr but registers other dispatches.
// ALSO_REGISTER_AVX512_DISPATCH should be used for ensuring AVX512 dispatch, among others.
#ifdef CPU_CAPABILITY_AVX512
#define REGISTER_DISPATCH(name, fn) REGISTER_ARCH_DISPATCH(name, CPU_CAPABILITY, ((void*)(fn) ? nullptr : nullptr))
#define REGISTER_DISPATCH(name, fn) REGISTER_ARCH_DISPATCH(name, CPU_CAPABILITY, nullptr)
#else
#define REGISTER_DISPATCH(name, fn) REGISTER_ARCH_DISPATCH(name, CPU_CAPABILITY, fn)
#endif

View File

@ -499,4 +499,13 @@ Tensor nll_loss2d_symint(const Tensor & self, const Tensor & target, const std::
return std::get<0>(at::nll_loss2d_forward_symint(self, target, weight, reduction, std::move(ignore_index)));
}
// Duplicate of above code for non-symbolic ints. Kept for BC purposes and to minimize breakages.
static Tensor nll_loss2d(const Tensor & self, const Tensor & target, const std::optional<Tensor>& weight_opt, int64_t reduction, int64_t ignore_index) {
// See [Note: hacky wrapper removal for optional tensor]
c10::MaybeOwned<Tensor> weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt);
const Tensor& weight = *weight_maybe_owned;
return std::get<0>(at::nll_loss2d_forward_symint(self, target, weight, reduction, ignore_index));
}
} // namespace at::native

View File

@ -28,6 +28,18 @@ Tensor empty_meta_symint(
size, dtype_opt, layout_opt, device_opt, pin_memory_opt, memory_format_opt);
}
// Kept only for BC with XLA
static Tensor empty_strided_meta(
IntArrayRef size,
IntArrayRef stride,
std::optional<ScalarType> dtype_opt,
std::optional<Layout> layout_opt,
std::optional<Device> device_opt,
std::optional<bool> pin_memory_opt
) {
return empty_strided_meta_symint(c10::fromIntArrayRefSlow(size), c10::fromIntArrayRefSlow(stride), dtype_opt, layout_opt, device_opt, pin_memory_opt);
}
Tensor empty_strided_meta_symint(
SymIntArrayRef size,
SymIntArrayRef stride,

View File

@ -802,6 +802,55 @@ TORCH_IMPL_FUNC(slow_conv_transpose2d_structured_cpu)
dilation);
}
static std::tuple<Tensor&, Tensor&, Tensor&> slow_conv_transpose2d_backward_out_cpu(const Tensor& grad_output,
const Tensor& input,
const Tensor& weight,
IntArrayRef kernel_size,
IntArrayRef stride,
IntArrayRef padding,
IntArrayRef output_padding,
IntArrayRef dilation,
Tensor& grad_input,
Tensor& grad_weight,
Tensor& grad_bias) {
if (grad_input.defined()) {
slow_conv_transpose2d_backward_out_cpu_template(
input,
grad_output,
grad_input,
weight,
kernel_size,
stride,
padding,
output_padding,
dilation);
}
if (grad_bias.defined()) {
at::sum_out(grad_bias, grad_output, IntArrayRef{0, 2, 3});
}
if (grad_weight.defined()) {
grad_weight.resize_(weight.sizes(), weight.suggest_memory_format());
grad_weight.zero_();
slow_conv_transpose2d_acc_grad_parameters_cpu(
input,
weight,
grad_output,
grad_weight,
grad_bias,
kernel_size,
stride,
padding,
output_padding,
dilation,
1);
}
return std::tuple<Tensor&, Tensor&, Tensor&>(
grad_input, grad_weight, grad_bias);
}
static std::tuple<Tensor, Tensor, Tensor> slow_conv_transpose2d_backward_cpu(
const Tensor& grad_output,
const Tensor& input,

View File

@ -871,6 +871,58 @@ Tensor slow_conv_transpose3d_cpu(
return output;
}
static std::tuple<Tensor&, Tensor&, Tensor&> slow_conv_transpose3d_backward_out_cpu(const Tensor& grad_output,
const Tensor& input,
const Tensor& weight,
IntArrayRef kernel_size,
IntArrayRef stride,
IntArrayRef padding,
IntArrayRef output_padding,
IntArrayRef dilation,
Tensor& grad_input,
Tensor& grad_weight,
Tensor& grad_bias) {
if (grad_input.defined()) {
slow_conv_transpose3d_backward_out_cpu_template(
input,
grad_output,
grad_input,
weight,
kernel_size,
stride,
padding,
output_padding,
dilation);
}
if (grad_weight.defined()) {
grad_weight.resize_(weight.sizes());
grad_weight.zero_();
}
if (grad_bias.defined()) {
grad_bias.resize_({weight.size(1)});
grad_bias.zero_();
}
if (grad_weight.defined() || grad_bias.defined()) {
slow_conv_transpose3d_acc_grad_parameters_cpu(
input,
grad_output,
grad_weight,
grad_bias,
kernel_size,
stride,
padding,
output_padding,
dilation,
1);
}
return std::tuple<Tensor&, Tensor&, Tensor&>(
grad_input, grad_weight, grad_bias);
}
static std::tuple<Tensor, Tensor, Tensor> slow_conv_transpose3d_backward_cpu(
const Tensor& grad_output,
const Tensor& input,

View File

@ -339,6 +339,12 @@ Tensor& gather_out(const Tensor& self, Dimname dim, const Tensor& index, bool sp
Tensor index_add(const Tensor& self, Dimname dim, const Tensor& index, const Tensor& source, const Scalar &alpha) {
reportNYIDimnameOverload("index_add");
}
static Tensor& index_add_(Tensor& self, Dimname dim, const Tensor& index, const Tensor& source, const Scalar &alpha) {
reportNYIDimnameOverload("index_add");
}
static Tensor& index_add_out(const Tensor& self, Dimname dim, const Tensor& index, const Tensor& source, const Scalar& alpha, Tensor& result) {
reportNYIDimnameOverload("index_add");
}
Tensor index_fill(const Tensor& self, Dimname dim, const Tensor& index, const Scalar& source) {
return at::index_fill(self, dimname_to_position(self, dim), index, source);
}
@ -366,12 +372,21 @@ Tensor index_select(const Tensor& self, Dimname dim, const Tensor& index) {
Tensor scatter(const Tensor& self, Dimname dim, const Tensor& index, const Tensor& source) {
reportNYIDimnameOverload("scatter");
}
static Tensor& scatter_(Tensor& self, Dimname dim, const Tensor& index, const Tensor& source) {
reportNYIDimnameOverload("scatter");
}
Tensor scatter(const Tensor& self, Dimname dim, const Tensor& index, const Scalar& source) {
reportNYIDimnameOverload("scatter");
}
static Tensor& scatter_(Tensor& self, Dimname dim, const Tensor& index, const Scalar& source) {
reportNYIDimnameOverload("scatter");
}
Tensor scatter_add(const Tensor& self, Dimname dim, const Tensor& index, const Tensor& source) {
reportNYIDimnameOverload("scatter_add");
}
static Tensor& scatter_add_(Tensor& self, Dimname dim, const Tensor& index, const Tensor& source) {
reportNYIDimnameOverload("scatter_add");
}
std::tuple<Tensor&, Tensor&> sort_out(const Tensor& self, std::optional<bool> stable, Dimname dim, bool keepdim, Tensor& values, Tensor& indices) {
reportNYIDimnameOverload("sort");
}

View File

@ -2276,6 +2276,11 @@ bool cpu_equal(const Tensor& self, const Tensor& other) {
return result.load();
}
static Tensor value_selecting_reduction_backward(const Tensor& grad, int64_t dim, const Tensor& indices, at::IntArrayRef sizes, bool keepdim) {
return at::native::value_selecting_reduction_backward_symint(grad, dim, indices, c10::fromIntArrayRefSlow(sizes), keepdim);
}
// max(dim), min(dim), topk(dim), mode(dim), are examples of reduction
// functions that select values. value_selecting_reduction_backward is the
// backward function for those operators; it propagates the grad to the

View File

@ -301,6 +301,14 @@ void reflection_pad2d_backward_out_template(
} // namespace
// TODO: I tihnk this function should be removed since we implement it with
// TORCH_IMPL_FUNC below
static Tensor& reflection_pad1d_out_cpu(const Tensor& input, IntArrayRef padding,
Tensor& output) {
reflection_pad1d_kernel(kCPU, output, input, padding);
return output;
}
Tensor& reflection_pad1d_out_quantized_cpu(const Tensor& input, IntArrayRef padding,
Tensor& output) {
TORCH_CHECK(input.qscheme() == kPerTensorAffine, "Only per tensor quantization is supported");

View File

@ -231,6 +231,14 @@ TensorImpl* resize_impl_cpu_(
return _resize_impl_(self, size, stride, resize_storage);
}
static TensorImpl* resize_impl_meta_(
TensorImpl* self,
c10::SymIntArrayRef size,
at::OptionalSymIntArrayRef stride,
bool resize_storage = true) {
return _resize_impl_(self, size, stride, resize_storage);
}
template <typename T>
const Tensor& _resize_(
const Tensor& self,

View File

@ -792,6 +792,12 @@ std::tuple<Tensor, Tensor> max(const Tensor& self, Dimname dim, bool keepdim) {
std::tuple<Tensor&, Tensor&> max_out(const Tensor& self, Dimname dim, bool keepdim, Tensor& max, Tensor& max_indices) {
return at::max_out(max, max_indices, self, dimname_to_position(self, dim), keepdim);
}
static Tensor argmax(const Tensor& /*self*/, Dimname /*dim*/, bool /*keepdim*/) {
reportNYIDimnameOverload("argmax");
}
static Tensor argmin(const Tensor& /*self*/, Dimname /*dim*/, bool /*keepdim*/) {
reportNYIDimnameOverload("argmin");
}
Tensor argsort(const Tensor& /*self*/, Dimname /*dim*/, bool /*keepdim*/) {
reportNYIDimnameOverload("argsort");
}

View File

@ -24,6 +24,10 @@
namespace at::native {
static bool is_cuda(const Tensor& self) {
return self.is_cuda();
}
bool is_distributed(const Tensor& self) {
return false;
}
@ -56,6 +60,18 @@ bool is_neg(const Tensor& self) {
return self.is_neg();
}
static bool is_sparse(const Tensor& self) {
return self.is_sparse();
}
static bool is_sparse_csr(const Tensor& self) {
return self.is_sparse_csr();
}
static bool is_quantized(const Tensor& self) {
return self.is_quantized();
}
// True if `self` and `from` have compatible tensor type so that `from`'s
// TensorImpl can be copied to `self`.
bool _has_compatible_shallow_copy_type(const Tensor& self, const Tensor& from) {

View File

@ -1,3 +1,7 @@
#include <cstdint>
#include <c10/util/Exception.h>
#include <c10/core/Scalar.h>
#include <c10/core/ScalarType.h>
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/core/Tensor.h>
#include <ATen/core/NamedTensor.h>
@ -10,6 +14,7 @@
#include <ATen/cuda/tunable/TunableGemm.h>
#include <ATen/native/Resize.h>
#include <c10/util/MaybeOwned.h>
#include <ATen/native/cuda/RowwiseScaledMM.h>
#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
@ -819,24 +824,97 @@ static bool _scaled_mm_allowed_device() {
#endif
}
namespace{
enum class ScalingType {
TensorWise,
RowWise,
Error
};
// Validates the scale tensors to scaled_mm
// And returns the type of scaling/which kernel to use
ScalingType get_scaling_type(
const c10::optional<at::Tensor>& scale_a,
const c10::optional<at::Tensor>& scale_b,
int64_t dim_m,
int64_t dim_n) {
TORCH_CHECK(
scale_a.has_value() == scale_b.has_value(),
"Both scale_a and scale_b must be present or absent.");
if (scale_a.has_value()) {
// Both Per-Tensor and Row-wise scaling expect fp32 tensors
TORCH_CHECK(
scale_a->scalar_type() == kFloat && scale_b->scalar_type() == kFloat,
"Both scale_a and scale_b must be float (fp32) tensors.");
// Check the singluar scale case for per-tensor scaling
if (scale_a->numel() == 1 && scale_b->numel() == 1) {
return ScalingType::TensorWise;
} else if (scale_a->dim() == 1 && scale_a->size(0) == dim_m) {
// Check the per-row scaling case
#if !defined(USE_ROCM) && !defined(_MSC_VER) || \
(defined(USE_ROCM) && ROCM_VERSION >= 60000)
TORCH_CHECK(
scale_a->dim() == 1 && scale_b->dim() == 1,
"Both scale_a and scale_b must be 1-dimensional tensors");
TORCH_CHECK(
scale_b->size(0) == dim_n,
"For row-wise scaling, scale_b must have size ",
dim_n,
" but got ",
scale_b->size(0),
".");
TORCH_CHECK(
scale_a->is_contiguous() && scale_b->is_contiguous(),
"Both scale_a and scale_b must be contiguous.");
return ScalingType::RowWise;
#else
TORCH_CHECK(false, "Per-row scaling is not supported for this platform!");
return ScalingType::Error;
#endif // !defined(USE_ROCM) && !defined(_MSC_VER) || (defined(USE_ROCM) &&
// ROCM_VERSION >= 60000)
} else {
TORCH_CHECK(
false,
"For row-wise scaling, scale_a must be size ",
dim_m,
" but got ",
scale_a->numel(),
" and scale_b must be size ",
dim_n,
" but got ",
scale_b->numel(),
".");
// Unreachable
return ScalingType::RowWise;
}
}
return ScalingType::Error;
}
} // namespace
// Computes matrix multiply + bias while applying scaling to input and output matrices and computes amax
// Scales are only applicable when matrices are of Float8 type and assumbed to be equal to 1.0 by default.
// If output matrix type is 16 or 32-bit type, neither scale_result is applied nor amax is computed.
// Known limitations:
// - Only works if mat1 is row-major and mat2 is column-major
// - Only works if matrices sizes are divisible by 32
//
// - If 1-dimensional tensors are used then scale_a should be size = mat1.size(0)
// and scale_b should have size = to mat2.size(1)
// Arguments:
// - `mat1`: the first operand of the matrix multiply, can be type `torch.float8_e4m3fn` or `torch.float8_e5m2`
// - `mat2`: the second operand of the matrix multiply, can be type `torch.float8_e4m3fn` or `torch.float8_e5m2`
// - `bias`: the bias, can be type `torch.float16` or `torch.bfloat16`
// - `out_dtype`: the output dtype, can either be a float8 or a higher precision floating point type
// - `scale_a`: a scalar tensor with the inverse scale of `mat1`, only needed if `mat1` is a float8 type
// - `scale_b`: a scalar tensor with the inverse scale of `mat2`, only needed if `mat2` is a float8 type
// - `scale_result`: a scalar tensor with the scale of the output, only set if the output is a float8 type
// - `scale_a`: a scalar or 1-dimensional tensor with the inverse scale of `mat1`, only needed if `mat1` is a float8 type
// - `scale_b`: a scalar or 1-dimensional tensor with the inverse scale of `mat2`, only needed if `mat2` is a float8 type
// - `scale_result`: a scalar tensor with the scale of the output, only utilized if the output is a float8 type
// - `use_fast_accum`: if true, enables fast float8 accumulation
// - `out`: a reference to the output tensor
// - `amax`: a reference to the amax tensor of the output, only needed if the output is a float8 type and will be updated inplace
// - `amax`: a reference to the amax tensor of the output, only mutated if the output is a float8 type and will be updated inplace
std::tuple<Tensor&, Tensor&>
_scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2,
@ -855,10 +933,11 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2,
TORCH_CHECK(
mat1.sizes()[1] == mat2.sizes()[0], "mat1 and mat2 shapes cannot be multiplied (",
mat1.sizes()[0], "x", mat1.sizes()[1], " and ", mat2.sizes()[0], "x", mat2.sizes()[1], ")");
TORCH_CHECK(!scale_a || (scale_a->numel() == 1 && scale_a->scalar_type() == kFloat),
"scale_a must be float scalar");
TORCH_CHECK(!scale_b || (scale_b->numel() == 1 && scale_b->scalar_type() == kFloat),
"scale_b must be a float scalar");
// Check what type of scaling we are doing based on inputs
ScalingType scaling_choice = get_scaling_type(scale_a, scale_b, mat1.size(0), mat2.size(1));
TORCH_INTERNAL_ASSERT(scaling_choice != ScalingType::Error, "Scaling type not supported");
TORCH_CHECK(!scale_result || (scale_result->numel() == 1 && scale_result->scalar_type() == kFloat),
"scale_result must be a float scalar");
TORCH_CHECK(!bias || bias->numel() == mat2.sizes()[1], "Bias must be size ", mat2.sizes()[1],
@ -901,12 +980,26 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2,
{scale_result_, "scale_result", 7}};
checkAllSameGPU(__func__, targs);
}
// Validation checks have passed lets resize the output to actual size
IntArrayRef mat1_sizes = mat1.sizes();
IntArrayRef mat2_sizes = mat2.sizes();
at::native::resize_output(out, {mat1_sizes[0], mat2_sizes[1]});
at::native::resize_output(amax, {});
// We are doing row-wise scaling
if (scaling_choice == ScalingType::RowWise) {
TORCH_CHECK(out.dtype() == kBFloat16, "Only bf16 high precsion output types are supported for row-wise scaling.");
at::cuda::detail::f8f8bf16_rowwise(
mat1,
mat2,
scale_a.value(),
scale_b.value(),
bias,
use_fast_accum,
out);
return {out, amax};
}
cublasCommonArgs args(mat1, mat2, out);
const auto out_dtype_ = args.result->scalar_type();
TORCH_CHECK(args.transa == 't' && args.transb == 'n', "Only multiplication of row-major and column-major matrices is supported by cuBLASLt");

View File

@ -29,10 +29,18 @@ static inline void maybe_resize_storage_cuda(TensorImpl* self, size_t new_size_b
inline TensorImpl* resize_impl_cuda_(
TensorImpl* self,
IntArrayRef size,
at::OptionalIntArrayRef stride) {
at::OptionalIntArrayRef stride,
bool device_guard = true) {
if (self->sizes() == size && (!stride || self->strides() == stride)) {
return self;
}
// NB: We don't need to hold the device guard when calling from TH
cuda::OptionalCUDAGuard guard;
if (device_guard) {
guard.set_index(self->storage().device().index());
}
const auto itemsize = self->dtype().itemsize();
const auto storage_offset = self->storage_offset();
size_t storage_size = 1;

View File

@ -0,0 +1,536 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/Dispatch.h>
#include <ATen/core/Tensor.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/nvrtc_stub/ATenNVRTC.h>
// Determine if the architecture supports rowwise scaled mm
// Currenlty failing on windows with: https://github.com/NVIDIA/cutlass/issues/1571
#if !defined(USE_ROCM) && !defined(_WIN32) && defined(CUDA_VERSION) && CUDA_VERSION >= 12000
#define BUILD_ROWWISE_FP8_KERNEL
#endif
#if defined(BUILD_ROWWISE_FP8_KERNEL)
// We are going to override the cuTensorMapEncodeTiled driver api with our lazy loader
static CUresult CUDAAPI nvrtc_cuTensorMapEncodeTiled(
CUtensorMap* tensorMap,
CUtensorMapDataType tensorDataType,
cuuint32_t tensorRank,
void* globalAddress,
const cuuint64_t* globalDim,
const cuuint64_t* globalStrides,
const cuuint32_t* boxDim,
const cuuint32_t* elementStrides,
CUtensorMapInterleave interleave,
CUtensorMapSwizzle swizzle,
CUtensorMapL2promotion l2Promotion,
CUtensorMapFloatOOBfill oobFill) {
return at::globalContext().getNVRTC().cuTensorMapEncodeTiled(
tensorMap,
tensorDataType,
tensorRank,
globalAddress,
globalDim,
globalStrides,
boxDim,
elementStrides,
interleave,
swizzle,
l2Promotion,
oobFill);
}
#include <cutlass/core_io.h>
#include <cutlass/cutlass.h>
#include <cutlass/gemm/device/gemm.h>
#include <cutlass/half.h>
#include <cutlass/numeric_types.h>
#include <cutlass/trace.h>
#include <cutlass/util/host_tensor.h>
// Rename the global function symbol
#define cuTensorMapEncodeTiled nvrtc_cuTensorMapEncodeTiled
#include <cute/tensor.hpp>
#undef cuTensorMapEncodeTiled
// Set everything back to normal
#include <cutlass/gemm/collective/collective_builder.hpp>
#include <cutlass/gemm/device/gemm_universal_adapter.h>
#include <cutlass/epilogue/collective/collective_builder.hpp>
#include <cute/atom/mma_atom.hpp>
#include <cutlass/gemm/dispatch_policy.hpp>
#include <cutlass/gemm/kernel/gemm_universal.hpp>
#include <cutlass/util/packed_stride.hpp>
namespace {
// Cutlass rowwise kernel
template <
int TB_M,
int TB_N,
int TB_K,
int TBS_M,
int TBS_N,
int TBS_K,
bool PONG,
bool FAST_ACCUM,
bool USE_BIAS,
typename INPUT_DTYPE,
typename BIAS_DTYPE>
void f8f8bf16_rowwise_impl(
at::Tensor XQ, // FP8
at::Tensor WQ, // FP8
at::Tensor x_scale,
at::Tensor w_scale,
c10::optional<at::Tensor> bias,
at::Tensor out) {
int M = XQ.size(0);
int N = WQ.size(1);
int K = XQ.size(1);
TORCH_CHECK(XQ.is_cuda() && XQ.is_contiguous());
TORCH_CHECK(
WQ.is_cuda() && WQ.ndimension() == 2 && WQ.stride(1) == WQ.size(0) &&
WQ.stride(0) == 1);
// auto Y = at::empty({M, N}, XQ.options().dtype(at::kBFloat16));
using ElementInputA = INPUT_DTYPE;
using LayoutInputA = cutlass::layout::RowMajor;
constexpr int AlignmentInputA = 16 / sizeof(ElementInputA);
using ElementInputB = cutlass::float_e4m3_t;
using LayoutInputB = cutlass::layout::ColumnMajor;
constexpr int AlignmentInputB = 16 / sizeof(ElementInputB);
using ElementBias = BIAS_DTYPE;
using ElementOutput = cutlass::bfloat16_t;
using LayoutOutput = cutlass::layout::RowMajor;
constexpr int AlignmentOutput = 16 / sizeof(ElementOutput);
using ElementAccumulator = float;
using ElementComputeEpilogue = float;
using ArchTag = cutlass::arch::Sm90; // Tag indicating the minimum SM that
// supports the intended feature
using OperatorClass = cutlass::arch::OpClassTensorOp;
using TileShape = cute::Shape<
cute::Int<TB_M>,
cute::Int<TB_N>,
cute::Int<TB_K>>; // Threadblock-level
// tile size
using ClusterShape = cute::Shape<
cute::Int<TBS_M>,
cute::Int<TBS_N>,
cute::Int<TBS_K>>; // Shape of the
// threadblocks in a
// cluster
using KernelSchedule = cutlass::gemm::collective::
KernelScheduleAuto; // Kernel to launch based on the default setting in
// the Collective Builder
// Implement rowwise scaling epilogue.
using XScale = cutlass::epilogue::fusion::Sm90ColBroadcast<
0,
TileShape,
ElementComputeEpilogue,
cute::Stride<cute::Int<1>, cute::Int<0>, cute::Int<0>>>;
using WScale = cutlass::epilogue::fusion::Sm90RowBroadcast<
PONG ? 2 : 1,
TileShape,
ElementComputeEpilogue,
cute::Stride<cute::Int<0>, cute::Int<1>, cute::Int<0>>>;
using Bias = cutlass::epilogue::fusion::Sm90RowBroadcast<
PONG ? 2 : 1,
TileShape,
ElementBias,
cute::Stride<cute::Int<0>, cute::Int<1>, cute::Int<0>>>;
using Accum = cutlass::epilogue::fusion::Sm90AccFetch;
using Compute0 = cutlass::epilogue::fusion::Sm90Compute<
cutlass::multiplies,
ElementComputeEpilogue, // First stage output type.
ElementComputeEpilogue, // First stage input types.
cutlass::FloatRoundStyle::round_to_nearest>;
using EVTCompute0 =
cutlass::epilogue::fusion::Sm90EVT<Compute0, WScale, Accum>;
using Compute1 = cutlass::epilogue::fusion::Sm90Compute<
cutlass::multiplies,
cute::conditional_t< // Second stage output type.
USE_BIAS,
ElementBias,
ElementOutput>,
ElementComputeEpilogue, // Second stage input types.
cutlass::FloatRoundStyle::round_to_nearest>;
using EVTCompute1 =
cutlass::epilogue::fusion::Sm90EVT<Compute1, XScale, EVTCompute0>;
using ComputeBias = cutlass::epilogue::fusion::Sm90Compute<
cutlass::plus,
ElementOutput, // Final (optional) stage output type.
ElementBias, // Final stage input types.
cutlass::FloatRoundStyle::round_to_nearest>;
using EVTComputeBias =
cutlass::epilogue::fusion::Sm90EVT<ComputeBias, Bias, EVTCompute1>;
using EpilogueEVT =
cute::conditional_t<USE_BIAS, EVTComputeBias, EVTCompute1>;
using CollectiveEpilogue =
typename cutlass::epilogue::collective::CollectiveBuilder<
cutlass::arch::Sm90,
cutlass::arch::OpClassTensorOp,
TileShape,
ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator,
ElementComputeEpilogue,
ElementOutput,
LayoutOutput,
AlignmentOutput,
ElementOutput,
LayoutOutput,
AlignmentOutput,
cutlass::epilogue::TmaWarpSpecialized,
EpilogueEVT>::CollectiveOp;
using DefaultSchedule = cutlass::gemm::KernelTmaWarpSpecialized;
using PongSchedule = cutlass::gemm::KernelTmaWarpSpecializedPingpong;
using FastDefaultSchedule =
cutlass::gemm::KernelTmaWarpSpecializedFP8FastAccum;
using FastPongSchedule =
cutlass::gemm::KernelTmaWarpSpecializedPingpongFP8FastAccum;
using SlowAccum = cute::conditional_t<PONG, PongSchedule, DefaultSchedule>;
using FastAccum =
cute::conditional_t<PONG, FastPongSchedule, FastDefaultSchedule>;
using MainLoopSchedule =
cute::conditional_t<FAST_ACCUM, FastAccum, SlowAccum>;
using CollectiveMainloop =
typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag,
OperatorClass,
ElementInputA,
LayoutInputA,
AlignmentInputA,
ElementInputB,
LayoutInputB,
AlignmentInputB,
ElementAccumulator,
TileShape,
ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(
sizeof(typename CollectiveEpilogue::SharedStorage))>,
MainLoopSchedule>::CollectiveOp;
using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
cute::Shape<int, int, int>,
CollectiveMainloop,
CollectiveEpilogue>;
using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
using StrideInputA = typename Gemm::GemmKernel::StrideA;
using StrideInputB = typename Gemm::GemmKernel::StrideB;
using StrideOutput = typename Gemm::GemmKernel::StrideC;
StrideInputA stride_a = cutlass::make_cute_packed_stride(
StrideInputA{}, cute::make_shape(M, K, 1));
StrideInputB stride_b = cutlass::make_cute_packed_stride(
StrideInputB{}, cute::make_shape(N, K, 1));
StrideOutput stride_output = cutlass::make_cute_packed_stride(
StrideOutput{}, cute::make_shape(M, N, 1));
typename Gemm::Arguments arguments{
cutlass::gemm::GemmUniversalMode::kGemm,
{M, N, K},
{reinterpret_cast<ElementInputA*>(XQ.data_ptr()),
stride_a,
reinterpret_cast<ElementInputB*>(WQ.data_ptr()),
stride_b},
{{}, // Epilogue thread we populate below.
(ElementOutput*)out.data_ptr<at::BFloat16>(),
stride_output,
(ElementOutput*)out.data_ptr<at::BFloat16>(),
stride_output}};
if constexpr (USE_BIAS) {
arguments.epilogue.thread = {
{reinterpret_cast<ElementBias*>(bias.value().data_ptr())}, // bias
// compute_1
{
{reinterpret_cast<ElementComputeEpilogue*>(
x_scale.data_ptr())}, // x_scale
// compute_0
{
{reinterpret_cast<ElementComputeEpilogue*>(
w_scale.data_ptr())}, // w_scale
{}, // Accumulator
{} // Multiplies
},
{}, // Multiplies
},
{}, // Plus
};
} else {
arguments.epilogue.thread = {
{reinterpret_cast<ElementComputeEpilogue*>(
x_scale.data_ptr())}, // x_scale
// compute_0
{
{reinterpret_cast<ElementComputeEpilogue*>(
w_scale.data_ptr())}, // w_scale
{}, // Accumulator
{} // Multiplies
},
{}, // Multiplies
};
}
Gemm gemm;
// Using the arguments, query for extra workspace required for matrix
// multiplication computation
size_t workspace_size = Gemm::get_workspace_size(arguments);
// Allocate workspace memory
cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);
// Check the problem size is supported or not
cutlass::Status status = gemm.can_implement(arguments);
if (status != cutlass::Status::kSuccess) {
throw std::runtime_error("cutlass cannot implement");
}
// Initialize CUTLASS kernel with arguments and workspace pointer
status = gemm.initialize(arguments, workspace.get());
if (status != cutlass::Status::kSuccess) {
throw std::runtime_error("cutlass cannot initialize");
}
status = gemm(at::cuda::getCurrentCUDAStream());
if (status != cutlass::Status::kSuccess) {
throw std::runtime_error(
std::string("cutlass cannot run") +
cutlass::cutlassGetStatusString(status));
}
C10_CUDA_KERNEL_LAUNCH_CHECK();
}
// FP8 Rowwise Cutlass kernel dispatch.
enum class KernelMode { Small, Large, Default };
KernelMode get_kernel_mode(at::Tensor XQ, at::Tensor WQ) {
auto M = XQ.size(0);
auto K = XQ.size(1);
auto N = WQ.size(0);
// Use a large kernel if at least two shapes are large....
bool use_large_kernel =
((M >= 2048 && K >= 2048) || (M >= 2048 && N >= 2048) ||
(K >= 2048 && N >= 2048));
if (M <= 128 || N <= 128) {
return KernelMode::Small;
} else if (use_large_kernel) {
return KernelMode::Large;
} else {
return KernelMode::Default;
}
}
template <typename InputDType, bool FastAccum, bool UseBias, typename BiasDType>
void dispatch_fp8_rowwise_kernel(
at::Tensor XQ,
at::Tensor WQ,
at::Tensor x_scale,
at::Tensor w_scale,
c10::optional<at::Tensor> bias,
at::Tensor out) {
KernelMode kernel = get_kernel_mode(XQ, WQ);
if (kernel == KernelMode::Small) {
return f8f8bf16_rowwise_impl<
64,
128,
128,
2,
1,
1,
false,
FastAccum,
UseBias,
InputDType,
BiasDType>(XQ, WQ, x_scale, w_scale, bias, out);
} else if (kernel == KernelMode::Large) {
return f8f8bf16_rowwise_impl<
128,
128,
128,
2,
1,
1,
true,
FastAccum,
UseBias,
InputDType,
BiasDType>(XQ, WQ, x_scale, w_scale, bias, out);
} else {
return f8f8bf16_rowwise_impl<
128,
128,
128,
1,
2,
1,
false,
FastAccum,
UseBias,
InputDType,
BiasDType>(XQ, WQ, x_scale, w_scale, bias, out);
}
}
} // namespace
#endif // !defined(USE_ROCM)
namespace at::cuda::detail {
void f8f8bf16_rowwise(
at::Tensor XQ, // FP8
at::Tensor WQ, // FP8
at::Tensor x_scale, // FP32
at::Tensor w_scale, // FP32
c10::optional<at::Tensor> bias, // BF16
bool use_fast_accum,
at::Tensor& out) {
#if defined(BUILD_ROWWISE_FP8_KERNEL)
// Check datatypes.
TORCH_CHECK(
x_scale.dtype() == at::kFloat && w_scale.dtype() == at::kFloat,
"Scale tensors must be float32.");
if (bias.has_value()) {
TORCH_CHECK(
bias.value().dtype() == at::kFloat ||
bias.value().dtype() == at::kBFloat16,
"Bias type must be bfloat16 or float32 if provided.");
}
// Extract problem size.
int M = XQ.size(0);
int N = WQ.size(1);
int K = XQ.size(1);
bool use_bias = bias.has_value();
bool bf16_bias = use_bias && bias.value().dtype() == at::kBFloat16;
// Templatize based on input dtype.
bool use_e5m2 = XQ.dtype() == at::kFloat8_e5m2;
TORCH_CHECK(WQ.dtype() == at::kFloat8_e4m3fn, "For row-wise scaling the second input is required to be a float8_e4m3fn dtype.");
if (use_bias) {
if (bf16_bias) {
if (use_fast_accum) {
if (use_e5m2) {
return dispatch_fp8_rowwise_kernel<
cutlass::float_e5m2_t,
true,
true,
cutlass::bfloat16_t>(XQ, WQ, x_scale, w_scale, bias, out);
} else {
return dispatch_fp8_rowwise_kernel<
cutlass::float_e4m3_t,
true,
true,
cutlass::bfloat16_t>(XQ, WQ, x_scale, w_scale, bias, out);
}
} else {
if (use_e5m2) {
return dispatch_fp8_rowwise_kernel<
cutlass::float_e5m2_t,
false,
true,
cutlass::bfloat16_t>(XQ, WQ, x_scale, w_scale, bias, out);
} else {
return dispatch_fp8_rowwise_kernel<
cutlass::float_e4m3_t,
false,
true,
cutlass::bfloat16_t>(XQ, WQ, x_scale, w_scale, bias, out);
}
}
} else {
if (use_fast_accum) {
if (use_e5m2) {
return dispatch_fp8_rowwise_kernel<
cutlass::float_e5m2_t,
true,
true,
float>(XQ, WQ, x_scale, w_scale, bias, out);
} else {
return dispatch_fp8_rowwise_kernel<
cutlass::float_e4m3_t,
true,
true,
float>(XQ, WQ, x_scale, w_scale, bias, out);
}
} else {
if (use_e5m2) {
return dispatch_fp8_rowwise_kernel<
cutlass::float_e5m2_t,
false,
true,
float>(XQ, WQ, x_scale, w_scale, bias, out);
} else {
return dispatch_fp8_rowwise_kernel<
cutlass::float_e4m3_t,
false,
true,
float>(XQ, WQ, x_scale, w_scale, bias, out);
}
}
}
} else {
if (use_fast_accum) {
if (use_e5m2) {
return dispatch_fp8_rowwise_kernel<
cutlass::float_e5m2_t,
true,
false,
float>(XQ, WQ, x_scale, w_scale, bias, out);
} else {
return dispatch_fp8_rowwise_kernel<
cutlass::float_e4m3_t,
true,
false,
float>(XQ, WQ, x_scale, w_scale, bias, out);
}
} else {
if (use_e5m2) {
return dispatch_fp8_rowwise_kernel<
cutlass::float_e5m2_t,
false,
false,
float>(XQ, WQ, x_scale, w_scale, bias, out);
} else {
return dispatch_fp8_rowwise_kernel<
cutlass::float_e4m3_t,
false,
false,
float>(XQ, WQ, x_scale, w_scale, bias, out);
}
}
}
#else // BUILD_ROWWISE_FP8_KERNEL
TORCH_CHECK(false, "Rowwise scaling is not currenlty supported on your device");
#endif
}
} // namespace at::cuda::detail

View File

@ -0,0 +1,15 @@
#pragma once
#include <ATen/core/TensorBase.h>
#include <c10/util/Optional.h>
namespace at::cuda::detail {
TORCH_API void f8f8bf16_rowwise(
at::Tensor XQ, // FP8
at::Tensor WQ, // FP8
at::Tensor x_scale, // FP32
at::Tensor w_scale, // FP32
c10::optional<at::Tensor> bias, // BF16
bool use_fast_accum,
at::Tensor& out);
} // at::cuda::detail

View File

@ -1,6 +1,5 @@
#include <ATen/native/SpectralOpsUtils.h>
#include <ATen/native/mps/MPSGraphSonomaOps.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <ATen/native/mps/OperationUtils.h>
#ifndef AT_PER_OPERATOR_HEADERS

View File

@ -38,19 +38,6 @@ static void addc_mul_div_out_mps(const Tensor& self,
};
@autoreleasepool {
bool executeGatherOpOnSelf =
!(self.is_contiguous(MemoryFormat::Contiguous) || self.is_contiguous(MemoryFormat::ChannelsLast) ||
self.is_contiguous(MemoryFormat::ChannelsLast3d));
Tensor output_ = at::empty_like(self, executeGatherOpOnSelf ? MemoryFormat::Contiguous : MemoryFormat::Preserve);
bool executeGatherOpOnFirstTensor =
!(tensor1.is_contiguous(MemoryFormat::Contiguous) || tensor1.is_contiguous(MemoryFormat::ChannelsLast) ||
tensor1.is_contiguous(MemoryFormat::ChannelsLast3d));
bool executeGatherOpOnSecondTensor =
!(tensor2.is_contiguous(MemoryFormat::Contiguous) || tensor2.is_contiguous(MemoryFormat::ChannelsLast) ||
tensor2.is_contiguous(MemoryFormat::ChannelsLast3d));
string key = op_name + getTensorsStringKey({self, tensor1, tensor2});
auto cachedGraph = LookUpOrCreateCachedGraph<CachedGraph>(key, [&](auto mpsGraph, auto newCachedGraph) {
@ -85,12 +72,10 @@ static void addc_mul_div_out_mps(const Tensor& self,
});
// Inputs as placeholders
Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor, self, nil, executeGatherOpOnSelf);
Placeholder tensor1Placeholder = Placeholder(cachedGraph->firstTensor, tensor1, nil, executeGatherOpOnFirstTensor);
Placeholder tensor2Placeholder =
Placeholder(cachedGraph->secondTensor, tensor2, nil, executeGatherOpOnSecondTensor);
Placeholder outputPlaceholder =
Placeholder(cachedGraph->outputTensor, executeGatherOpOnSelf ? output_ : output, nil, false);
Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor, self);
Placeholder tensor1Placeholder = Placeholder(cachedGraph->firstTensor, tensor1);
Placeholder tensor2Placeholder = Placeholder(cachedGraph->secondTensor, tensor2);
Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor, output);
MPSScalar value_scalar = getMPSScalar(value_opt, self.scalar_type());
// Create dictionary of inputs and outputs
@ -102,10 +87,6 @@ static void addc_mul_div_out_mps(const Tensor& self,
};
runMPSGraph(mpsStream, cachedGraph->graph(), feeds, outputPlaceholder);
if (executeGatherOpOnSelf) {
output.copy_(output_);
}
}
}

View File

@ -237,5 +237,24 @@ Tensor std_quantized_cpu(
return result;
}
static Tensor std_quantized_cpu(
const Tensor& self,
DimnameList dim,
const std::optional<Scalar>& correction,
bool keepdim) {
return std_quantized_cpu(
self, dimnames_to_positions(self, dim), correction, keepdim);
}
static Tensor& std_out_quantized_cpu(
Tensor& result,
const Tensor& self,
DimnameList dim,
const std::optional<Scalar>& correction,
bool keepdim) {
return std_out_quantized_cpu(
self, dimnames_to_positions(self, dim), correction, keepdim, result);
}
} // namespace native
} // namespace at

View File

@ -606,6 +606,15 @@ Tensor log_softmax_backward_sparse_cpu(
return grad_input;
}
static Tensor _sparse_softmax(const Tensor& input_, const int64_t dim_) {
auto result = [&]() {
NoNamesGuard guard;
return at::_sparse_softmax(input_, dim_, false);
}();
namedinference::propagate_names(result, input_);
return result;
}
Tensor _sparse_softmax(const Tensor& input_, const int64_t dim_, std::optional<ScalarType> dtype) {
auto result = [&]() {
NoNamesGuard guard;

View File

@ -190,8 +190,8 @@ struct AttentionKernel {
unsigned long long dropout_batch_head_rng_offset = 0;
float dropout_prob = 0.0f;
at::PhiloxCudaState rng_engine_inputs = at::PhiloxCudaState(0, 0);
int64_t* extragraph_offset = nullptr;
int64_t* seed = nullptr;
int64_t* extragraph_offset;
int64_t* seed;
// Moves pointers to what we should process
// Returns "false" if there is no work to do

View File

@ -0,0 +1,58 @@
import torch
from torch.utils.data import Dataset
def collate_sentences_lm(samples):
if len(samples) == 0:
return {}
id = torch.LongTensor([s["id"] for s in samples])
src_tokens = torch.stack([s["source"] for s in samples], 0)
tgt_tokens = torch.stack([s["target"] for s in samples], 0)
ntokens = len(samples) * len(samples[0]["target"])
src_lengths = torch.LongTensor([len(samples[0]["source"])] * len(samples))
batch = {
"id": id,
"nsentences": len(samples),
"ntokens": ntokens,
"input": src_tokens,
"target": tgt_tokens,
}
return batch
class BenchmarkLMDataset(Dataset):
"""
Dataset to benchmark a translation like seq2seq task.
Args:
vocab_size (int, optional): size of the vocabulary (default 10000).
max_source_positions (int, optional): max number of tokens in the
source sentence (default: 1024).
total_samples (int, optional): the total number of rows in the
dataset (default: 10000).
"""
def __init__(
self,
vocab_size=10000,
max_source_positions=1024,
total_samples=10000,
):
self.vocab_size = vocab_size
self.max_source_positions = max_source_positions
self.total_samples = total_samples
self.sizes = [self.max_source_positions] * self.total_samples
def __getitem__(self, index):
length = self.sizes[index]
source = torch.randint(1, self.vocab_size, (length,))
target = source.clone()
return {
"id": index,
"source": source,
"target": target,
}
def __len__(self):
return self.total_samples

View File

@ -0,0 +1,296 @@
import argparse
import math
import os
import time
from benchmark_dataset import BenchmarkLMDataset, collate_sentences_lm
import torch
import torch.nn as nn
from torch.distributed import rpc
from torch.distributed.pipeline.sync import Pipe
from torch.distributed.pipeline.sync.utils import partition_model
from torch.optim import Adam
from torch.utils.data import DataLoader
def sizeof_fmt(num, suffix="B"):
for unit in ["", "Ki", "Mi", "Gi", "Ti"]:
if abs(num) < 1024.0:
return f"{num:3.2f}{unit}B"
num /= 1024.0
def init_random_seed(seed: int):
import numpy
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
numpy.random.seed(seed)
iteration_count = 0
class EmbeddingLayer(nn.Embedding):
def __init__(self, ntoken, ninp, initrange):
super().__init__(ntoken, ninp)
self.ninp = ninp
nn.init.uniform_(self.weight, -initrange, initrange)
def forward(self, src):
return super().forward(src) * math.sqrt(self.ninp)
class PositionalEncodingLayer(nn.Module):
def __init__(self, d_model, dropout=0.1, max_len=5000):
super().__init__()
self.dropout = nn.Dropout(p=dropout)
pe = torch.zeros(max_len, d_model)
position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
div_term = torch.exp(
torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)
)
pe[:, 0::2] = torch.sin(position * div_term)
pe[:, 1::2] = torch.cos(position * div_term)
pe = pe.unsqueeze(0).transpose(0, 1)
self.register_buffer("pe", pe)
def forward(self, x):
x = x + self.pe[: x.size(0), :]
return self.dropout(x)
class TransformerDecoderLayer(nn.TransformerEncoderLayer):
"""Though this class inherits from torch.nn.TransformerEncoderLayer,
it functions as a decoder in this model"""
def __init__(self, ninp, nhead, nhid, droupout):
super().__init__(ninp, nhead, nhid, droupout)
self.src_mask = None
def forward(self, src):
global iteration_count
iteration_count += 1
if self.src_mask is None or self.src_mask.size(0) != len(src):
device = src.device
mask = nn.Transformer.generate_square_subsequent_mask(len(src)).to(device)
self.src_mask = mask
return super().forward(src, self.src_mask)
class LinearLayer(nn.Linear):
def __init__(self, ninp, ntoken, initrange):
super().__init__(ninp, ntoken)
nn.init.zeros_(self.bias)
nn.init.uniform_(self.weight, -initrange, initrange)
class TransformerLMSequential(nn.Sequential):
"""A small language model based on the design of GPT-2 using nn.Sequential
for compatibility with Pipe"""
def __init__(self, ntokens, ninp, nhead, nhid, dropout, initrange, ndecoder):
layers = [
EmbeddingLayer(ntokens, ninp, initrange),
PositionalEncodingLayer(ninp, dropout),
]
for _ in range(ndecoder):
layers.append(TransformerDecoderLayer(ninp, nhead, nhid, dropout))
layers.append(LinearLayer(ninp, ntokens, initrange))
super().__init__(*layers)
def make_model(args, device, ntokens):
ninp = 2048 # embedding dimension
nhid = (
2048 # the dimension of the feedforward network model in nn.TransformerEncoder
)
nhead = 32 # the number of heads in the multiheadattention models
dropout = 0
initrange = 0.1
ndecoder = args.num_decoder_layers
model = TransformerLMSequential(
ntokens, ninp, nhead, nhid, dropout, initrange, ndecoder
).to(device)
criterion = nn.CrossEntropyLoss()
lr = 0.01 # learning rate
def make_adam(model):
return Adam(model.parameters(), lr=lr)
optimizer = make_adam
return model, criterion, optimizer
def train(lm_dataloader, model, criterion, optimizer, vocab_size, args):
model.train()
vocab_size = 10000
total_loss = 0.0
start_time = time.time()
word_counter = 0
optimizer = optimizer(model)
def get_first_device(model):
if model.devices:
return model.devices[0]
else:
return torch.cuda.current_device()
def get_last_device(model):
if model.devices:
return model.devices[-1]
else:
return torch.cuda.current_device()
print(
f"Number of parameters for model: {sum(p.numel() for p in model.parameters())}"
)
for i, batch in enumerate(lm_dataloader):
bi = batch["input"]
if args.max_batch and i > args.max_batch:
break
optimizer.zero_grad()
try:
tmp = batch["input"].to(get_first_device(model))
output = model(tmp).local_value()
except Exception as e:
raise RuntimeError(
f"training failed on {torch.distributed.get_rank()}"
) from e
target = batch["target"].to(get_last_device(model))
output = output.to(target.device)
loss = criterion(output.view(-1, vocab_size), target.view(-1))
loss.backward()
del target
del output
torch.nn.utils.clip_grad_value_(model.parameters(), 0.05)
optimizer.step()
total_loss += loss.item()
log_interval = 1
word_counter += batch["ntokens"]
if i % log_interval == 0 and i > 0:
cur_loss = total_loss / log_interval
elapsed = time.time() - start_time
print(
f"| batch {i:5d} | wps {word_counter / elapsed:5.2f} | loss {cur_loss:5.2f} | ppl {math.exp(cur_loss):8.2f}"
)
word_counter = 0
total_loss = 0
start_time = time.time()
print("Peak memory usage for GPUs: ", end="")
for i in range(len(model.devices)):
print(
f"cuda:{i}: {sizeof_fmt(torch.cuda.memory_stats(i)['allocated_bytes.all.peak'])}, ",
end="",
)
print()
def generate_balance(num_devices, num_layers):
balance = []
layers_assigned = 0
for i in range(num_devices):
x = (num_layers - layers_assigned) / (num_devices - i)
if x.is_integer():
balance.append(int(x))
layers_assigned += x
else:
balance.append(math.ceil(x))
layers_assigned += math.ceil(x)
return balance
def make_model_and_data(args, device):
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
vocab_size = 10000
model, criterion, optimizer = make_model(args, device, vocab_size)
lm_dataset = BenchmarkLMDataset()
lm_dataloader = DataLoader(
lm_dataset,
batch_size=args.batch_size,
shuffle=True,
num_workers=0,
collate_fn=collate_sentences_lm,
)
return {
"model": model,
"criterion": criterion,
"optimizer": optimizer,
"data": lm_dataloader,
"vocab_size": vocab_size,
}
def bench_single_process(args):
os.environ.update({"MASTER_ADDR": args.host})
os.environ.update({"MASTER_PORT": "10638"})
rpc.init_rpc(
"worker",
rank=0,
world_size=1,
)
num_devices = torch.cuda.device_count() if torch.cuda.is_available() else 1
num_devices = min(args.num_devices, num_devices)
assert num_devices > 0
init_random_seed(0)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
blob = make_model_and_data(args, None)
model = blob["model"]
balance = generate_balance(num_devices, len(model))
model = partition_model(model, balance)
p = Pipe(model, chunks=args.chunks, checkpoint=args.checkpoint)
del model
del blob["model"]
train(
blob["data"], p, blob["criterion"], blob["optimizer"], blob["vocab_size"], args
)
parser = argparse.ArgumentParser(description="benchmark")
parser.add_argument("--host", "-o", type=str, default="localhost", help="hostname")
parser.add_argument(
"--chunks", type=int, default=4, help="number of microbatches per batch"
)
parser.add_argument("--batch-size", type=int, default=8, help="size of a batch")
parser.add_argument("--max-batch", type=int, default=10, help="Max number of batches")
parser.add_argument(
"--num-decoder-layers",
type=int,
default=10,
help="Number of decoder layers in the model",
)
parser.add_argument(
"--checkpoint",
default="except_last",
choices=["always", "except_last", "never"],
help="Checkpointing strategy for pipe",
)
parser.add_argument(
"--num-devices", type=int, default=4, help="Number of GPU devices to use"
)
if __name__ == "__main__":
args = parser.parse_args()
print(f"Running benchmark with args: {args}")
bench_single_process(args)

View File

@ -218,7 +218,7 @@ tf_mixnet_l,pass,6
tinynet_a,fail_accuracy,6
tinynet_a,pass,6

1 name accuracy graph_breaks
218
219
220
221
222
223
224

View File

@ -178,7 +178,7 @@ hf_T5_base,eager_fail_to_run,0
hf_T5_generate,pass,5
hf_T5_generate,fail_to_run,5

1 name accuracy graph_breaks
178
179
180
181
182
183
184

View File

@ -178,7 +178,7 @@ hf_T5_base,eager_fail_to_run,0
hf_T5_generate,pass,5
hf_T5_generate,fail_to_run,5

1 name accuracy graph_breaks
178
179
180
181
182
183
184

View File

@ -182,7 +182,7 @@ phlippe_densenet,pass,6
phlippe_resnet,pass,6
phlippe_resnet,fail_accuracy,6

1 name accuracy graph_breaks
182
183
184
185
186
187
188

View File

@ -182,7 +182,7 @@ phlippe_densenet,pass,6
phlippe_resnet,pass,6
phlippe_resnet,fail_accuracy,6

1 name accuracy graph_breaks
182
183
184
185
186
187
188

View File

@ -218,7 +218,7 @@ tf_mixnet_l,pass,6
tinynet_a,fail_accuracy,6
tinynet_a,pass,6

1 name accuracy graph_breaks
218
219
220
221
222
223
224

View File

@ -178,7 +178,7 @@ hf_T5_base,eager_fail_to_run,0
hf_T5_generate,pass,5
hf_T5_generate,fail_to_run,5

1 name accuracy graph_breaks
178
179
180
181
182
183
184

View File

@ -6,7 +6,7 @@ adv_inception_v3,pass,6
beit_base_patch16_224,fail_accuracy,7
beit_base_patch16_224,pass,7

1 name accuracy graph_breaks
6 coat_lite_mini pass 6
7 convit_base pass 7
8 convmixer_768_32 pass 5
9 convnext_base pass 7
10 crossvit_9_240 pass 7
11 cspdarknet53 pass 7
12 deit_base_distilled_patch16_224 pass 7

View File

@ -178,7 +178,7 @@ hf_T5_base,eager_fail_to_run,0
hf_T5_generate,pass,5
hf_T5_generate,fail_to_run,5

1 name accuracy graph_breaks
178
179
180
181
182
183
184

View File

@ -6,7 +6,7 @@ adv_inception_v3,pass,6
beit_base_patch16_224,fail_accuracy,7
beit_base_patch16_224,pass,7

1 name accuracy graph_breaks
6 coat_lite_mini pass 6
7 convit_base pass 7
8 convmixer_768_32 pass 5
9 convnext_base pass 7
10 crossvit_9_240 pass 7
11 cspdarknet53 pass 7
12 deit_base_distilled_patch16_224 pass 7

View File

@ -143,7 +143,6 @@ CI_SKIP_DYNAMIC_BATCH_ONLY = {
"pyhpc_equation_of_state",
"pyhpc_turbulent_kinetic_energy",
"detectron2_fcos_r_50_fpn",
"hf_T5_generate",
}
# These models currently fail accuracy with eager Adam optimizer
@ -3975,12 +3974,9 @@ def run(runner, args, original_dir=None):
assert "cuda" in args.devices, "Quantization requires CUDA device."
assert args.bfloat16, "Quantization requires dtype bfloat16."
try:
from torchao_backend import setup_baseline, torchao_optimize_ctx
from .torchao_backend import setup_baseline, torchao_optimize_ctx
except ImportError:
from userbenchmark.dynamo.dynamobench.torchao_backend import (
setup_baseline,
torchao_optimize_ctx,
)
from torchao_backend import setup_baseline, torchao_optimize_ctx
setup_baseline()
baseline_ctx = functools.partial(

View File

@ -827,7 +827,6 @@ libtorch_python_core_sources = [
"torch/csrc/dynamo/guards.cpp",
"torch/csrc/dynamo/init.cpp",
"torch/csrc/functorch/init.cpp",
"torch/csrc/fx/node.cpp",
"torch/csrc/mps/Module.cpp",
"torch/csrc/mtia/Module.cpp",
"torch/csrc/inductor/aoti_runner/pybind.cpp",

View File

@ -49,15 +49,33 @@ class C10_API SymNodeImpl : public c10::intrusive_ptr_target {
virtual SymNode mul(const SymNode& other) {
TORCH_CHECK(false, "NYI");
}
// NB: legacy, prefer float_truediv or int_truediv
virtual SymNode truediv(const SymNode& other) {
TORCH_CHECK(false, "NYI");
}
virtual SymNode float_truediv(const SymNode& other) {
return truediv(other);
}
virtual SymNode int_truediv(const SymNode& other) {
return truediv(other);
}
// NB: legacy, prefer float_pow or pow_by_natural
virtual SymNode pow(const SymNode& other) {
TORCH_CHECK(false, "NYI");
}
virtual SymNode float_pow(const SymNode& other) {
return pow(other);
}
virtual SymNode pow_by_natural(const SymNode& other) {
return pow(other);
}
// NB: legacy, prefer int_floordiv
virtual SymNode floordiv(const SymNode& other) {
TORCH_CHECK(false, "NYI");
}
virtual SymNode int_floordiv(const SymNode& other) {
return floordiv(other);
}
virtual SymNode mod(const SymNode& other) {
TORCH_CHECK(false, "NYI");
}

View File

@ -10,7 +10,7 @@ endif
CUDA_VERSION_SHORT ?= 12.1
CUDA_VERSION ?= 12.1.1
CUDNN_VERSION ?= 9
CUDNN_VERSION ?= 8
BASE_RUNTIME = ubuntu:22.04
BASE_DEVEL = nvidia/cuda:$(CUDA_VERSION)-devel-ubuntu22.04
CMAKE_VARS ?=

View File

@ -606,6 +606,47 @@ coverage_ignore_functions = [
# torch.distributed.optim.utils
"as_functional_optim",
"register_functional_optim",
# torch.distributed.pipeline.sync.checkpoint
"checkpoint",
"enable_checkpointing",
"enable_recomputing",
"is_checkpointing",
"is_recomputing",
"restore_rng_states",
"save_rng_states",
# torch.distributed.pipeline.sync.dependency
"fork",
"join",
# torch.distributed.pipeline.sync.microbatch
"check",
"gather",
"scatter",
# torch.distributed.pipeline.sync.phony
"get_phony",
# torch.distributed.pipeline.sync.skip.layout
"inspect_skip_layout",
# torch.distributed.pipeline.sync.skip.tracker
"current_skip_tracker",
"use_skip_tracker",
# torch.distributed.pipeline.sync.stream
"as_cuda",
"current_stream",
"default_stream",
"get_device",
"is_cuda",
"new_stream",
"record_stream",
"use_device",
"use_stream",
"wait_stream",
# torch.distributed.pipeline.sync.utils
"partition_model",
# torch.distributed.pipeline.sync.worker
"create_workers",
"spawn_workers",
"worker",
# torch.distributed.pipelining.PipelineSchedule
"step",
# torch.distributed.rendezvous
"register_rendezvous_handler",
"rendezvous",
@ -2609,6 +2650,52 @@ coverage_ignore_classes = [
"PostLocalSGDOptimizer",
# torch.distributed.optim.zero_redundancy_optimizer
"ZeroRedundancyOptimizer",
# torch.distributed.pipeline.sync.batchnorm
"DeferredBatchNorm",
# torch.distributed.pipeline.sync.checkpoint
"Checkpoint",
"Checkpointing",
"Context",
"Function",
"Recompute",
"ThreadLocal",
# torch.distributed.pipeline.sync.copy
"Context",
"Copy",
"Wait",
# torch.distributed.pipeline.sync.dependency
"Fork",
"Join",
# torch.distributed.pipeline.sync.microbatch
"Batch",
"NoChunk",
# torch.distributed.pipeline.sync.pipe
"BalanceError",
"Pipe",
"PipeSequential",
"WithDevice",
# torch.distributed.pipeline.sync.pipeline
"Pipeline",
# torch.distributed.pipeline.sync.skip.layout
"SkipLayout",
# torch.distributed.pipeline.sync.skip.namespace
"Namespace",
# torch.distributed.pipeline.sync.skip.portal
"Context",
"Portal",
"PortalBlue",
"PortalCopy",
"PortalOrange",
# torch.distributed.pipeline.sync.skip.skippable
"Skippable",
# torch.distributed.pipeline.sync.skip.tracker
"SkipTracker",
"SkipTrackerThroughPotals",
"ThreadLocal",
# torch.distributed.pipeline.sync.stream
"CPUStreamType",
# torch.distributed.pipeline.sync.worker
"Task",
# torch.distributed.rpc.api
"AllGatherStates",
"RRef",

View File

@ -4,407 +4,184 @@
Pipeline Parallelism
####################
.. note::
``torch.distributed.pipelining`` is currently in alpha state and under
development. API changes may be possible. It was migrated from the `PiPPy
<https://github.com/pytorch/PiPPy>`_ project.
.. note:: ``torch.distributed.pipelining`` is a package migrated from the `PiPPy <https://github.com/pytorch/PiPPy>`_ project. It is currently in alpha state and under extensive development. For examples that work with our APIs, please refer to PiPPy's `examples <https://github.com/pytorch/PiPPy/tree/main/examples>`_ directory.
Why Pipeline Parallel?
**********************
Pipeline Parallelism is one of the **primitive** parallelism for deep learning.
It allows the **execution** of a model to be partitioned such that multiple
**micro-batches** can execute different parts of the model code concurrently.
Pipeline parallelism can be an effective technique for:
* large-scale training
* bandwidth-limited clusters
* large model inference.
The above scenarios share a commonality that the computation per device cannot
hide the communication of conventional parallelism, for example, the weight
all-gather of FSDP.
One of the most important techniques for advancing the state of the art in deep learning is scaling. Common techniques for scaling neural networks include *data parallelism*, *tensor/operation parallelism*, and *pipeline parallelism* (or *pipelining*). Pipelining is a technique in which the *code* of the model is partitioned and multiple *micro-batches* execute different parts of the model code concurrently. In many cases, pipeline parallelism can be an effective technique for scaling, in particular for large-scale jobs or bandwidth-limited interconnects. To learn more about pipeline parallelism in deep learning, see `this article <https://www.deepspeed.ai/tutorials/pipeline/>`_.
What is ``torch.distributed.pipelining``?
*****************************************
While promising for scaling, pipelining is often difficult to implement because
it needs to **partition the execution** of a model in addition to model weights.
The partitioning of execution often requires intrusive code changes to your
model. Another aspect of complexity comes from **scheduling micro-batches in a
distributed environment**, with **data flow dependency** considered.
.. automodule:: torch.distributed.pipelining
The ``pipelining`` package provides a toolkit that does said things
**automatically** which allows easy implementation of pipeline parallelism
on **general** models.
.. currentmodule:: torch.distributed.pipelining
It consists of two parts: a
**splitting frontend** and a **distributed runtime**.
The splitting frontend takes your model code as-is, splits it up into "model
partitions", and captures the data-flow relationship. The distributed runtime
executes the pipeline stages on different devices in parallel, handling things
like micro-batch splitting, scheduling, communication, and gradient propagation,
etc.
While promising for scaling, pipelining is often difficult to implement, requiring intrusive code changes to model code and difficult-to-implement runtime orchestration code. ``torch.distributed.pipelining`` aims to provide **a toolkit that does said things automatically to allow high-productivity scaling of models.** It consists of a **compiler** and a **runtime** stack for easy pipelining of PyTorch models. In particular, it provides the following features:
Overall, the ``pipelining`` package provides the following features:
* Splitting of model code based on your specification. The goal is for the user to provide model code as-is to the system for parallelization, without having to make heavyweight modifications to make parallelism work. The specification is also simple.
* Support for rich pipeline scheduling paradigms, including GPipe, 1F1B, Interleaved 1F1B and Looped BFS. It will be also easy to customize your own schedule under this framework.
* First-class support for cross-host pipeline parallelism, as this is where PP is typically used (over slower interconnects).
* Composability with other PyTorch parallel schemes such as data parallelism (DDP, FSDP) or tensor parallelism (overall, known as "3d parallelism").
* Splitting of model code based on simple specification.
* Rich support for pipeline schedules, including GPipe, 1F1B,
Interleaved 1F1B and Looped BFS, and providing the infrastruture for writing
customized schedules.
* First-class support for cross-host pipeline parallelism, as this is where PP
is typically used (over slower interconnects).
* Composability with other PyTorch parallel techniques such as data parallel
(DDP, FSDP) or tensor parallel. The `TorchTitan
<https://github.com/pytorch/torchtitan>`_ project demonstrates a "3D parallel"
application on the Llama model.
Examples
********
In the `PiPPy <https://github.com/pytorch/PiPPy>`_ repo where this package is migrated from, we provide rich examples based on realistic models. In particular, we show how to apply pipelining without any model code change. You can refer to the `HuggingFace examples directory <https://github.com/pytorch/PiPPy/tree/main/examples/huggingface>`_. Popular examples include: `GPT2 <https://github.com/pytorch/PiPPy/tree/main/examples/huggingface/pippy_gpt2.py>`_, and `LLaMA <https://github.com/pytorch/PiPPy/tree/main/examples/llama>`_.
Techniques Explained
********************
``torch.distributed.pipelining`` consists of two parts: a *compiler* and a *runtime*. The compiler takes your model code, splits it up, and transforms it into a ``Pipe``, which is a wrapper that describes the model at each pipeline stage and their data-flow relationship. The runtime executes the ``PipelineStage`` in parallel, handling things like micro-batch splitting, scheduling, communication, and gradient propagation, etc. We will cover the APIs for these concepts in this section.
Splitting a Model with ``pipeline``
===================================
To see how we can split a model into a pipeline, let's first take an example trivial neural network:
.. code-block:: python
import torch
class MyNetworkBlock(torch.nn.Module):
def __init__(self, in_dim, out_dim):
super().__init__()
self.lin = torch.nn.Linear(in_dim, out_dim)
def forward(self, x):
x = self.lin(x)
x = torch.relu(x)
return x
Step 1: build ``PipelineStage`` for execution
*********************************************
class MyNetwork(torch.nn.Module):
def __init__(self, in_dim, layer_dims):
super().__init__()
Before we can use a ``PipelineSchedule``, we need to create ``PipelineStage``
objects that wrap the part of the model running in that stage. The
``PipelineStage`` is responsible for allocating communication buffers and
creating send/recv ops to communicate with its peers. It manages intermediate
buffers e.g. for the outputs of forward that have not been consumed yet, and it
provides a utility for running the backwards for the stage model.
prev_dim = in_dim
for i, dim in enumerate(layer_dims):
setattr(self, f'layer{i}', MyNetworkBlock(prev_dim, dim))
prev_dim = dim
A ``PipelineStage`` needs to know the input and output shapes for the stage
model, so that it can correctly allocate communication buffers. The shapes must
be static, e.g. at runtime the shapes can not change from step to step. A class
``PipeliningShapeError`` will be raised if runtime shapes do not match the
expected shapes. When composing with other paralleisms or applying mixed
precision, these techniques must be taken into account so the ``PipelineStage``
knows the correct shape (and dtype) for the output of the stage module at
runtime.
self.num_layers = len(layer_dims)
# 10 output classes
self.output_proj = torch.nn.Linear(layer_dims[-1], 10)
Users may construct a ``PipelineStage`` instance directly, by passing in an
``nn.Module`` representing the portion of the model that should run on the
stage. This may require changes to the original model code. See the example
in :ref:`option_1_manual`.
def forward(self, x):
for i in range(self.num_layers):
x = getattr(self, f'layer{i}')(x)
Alternatively, the splitting frontend can use graph partitioning to split your
model into a series of ``nn.Module`` automatically. This technique requires the
model is traceable with ``torch.Export``. Composability of the resulting
``nn.Module`` with other parallelism techniques is experimental, and may require
some workarounds. Usage of this frontend may be more appealing if the user
cannot easily change the model code. See :ref:`option_2_tracer` for more
information.
return self.output_proj(x)
Step 2: use ``PipelineSchedule`` for execution
**********************************************
in_dim = 512
layer_dims = [512, 1024, 256]
mn = MyNetwork(in_dim, layer_dims).to(device)
We can now attach the ``PipelineStage`` to a pipeline schedule, and run the
schedule with input data. Here is a GPipe example:
This network is written as free-form Python code; it has not been modified for any specific parallelism technique.
Let us see our usage of the ``pipeline`` interface:
.. code-block:: python
from torch.distributed.pipelining import annotate_split_points, pipeline, Pipe, SplitPoint
annotate_split_points(mn, {'layer0': SplitPoint.END,
'layer1': SplitPoint.END})
batch_size = 32
example_input = torch.randn(batch_size, in_dim, device=device)
chunks = 4
pipe = pipeline(mn, chunks, example_args=(example_input,))
print(pipe)
::
************************************* pipe *************************************
GraphModule(
(submod_0): GraphModule(
(layer0): InterpreterModule(
(lin): InterpreterModule()
)
)
(submod_1): GraphModule(
(layer1): InterpreterModule(
(lin): InterpreterModule()
)
)
(submod_2): GraphModule(
(layer2): InterpreterModule(
(lin): InterpreterModule()
)
(output_proj): InterpreterModule()
)
)
def forward(self, arg8_1):
submod_0 = self.submod_0(arg8_1); arg8_1 = None
submod_1 = self.submod_1(submod_0); submod_0 = None
submod_2 = self.submod_2(submod_1); submod_1 = None
return (submod_2,)
So what's going on here? First, ``pipeline`` turns our model into a directed acyclic graph (DAG) by tracing the model. Then, it groups together the operations and parameters into *pipeline stages*. Stages are represented as ``submod_N`` submodules, where ``N`` is a natural number.
We used ``annotate_split_points`` to specify that the code should be split and the end of ``layer0`` and ``layer1``. Our code has thus been split into *three* pipeline stages. Our library also provides ``SplitPoint.BEGINNING`` if a user wants to split before certain annotation point.
While the ``annotate_split_points`` API gives users a way to specify the split points without modifying the model, our library also provides an API for in-model annotation: ``pipe_split()``. For details, you can read `this example <https://github.com/pytorch/PiPPy/blob/main/test/test_pipe.py>`_.
This covers the basic usage of the ``Pipe`` API. For more information, please see the documentation.
Using ``PipelineSchedule`` for Execution
========================================
After transforming the model into a ``Pipe`` representation, we can run its stages in a distributed *runtime*. This can be done in two steps:
* instantiate a ``PipelineStage`` from a stage module of ``Pipe``;
* run the ``PipelineStage`` according to a ``PipelineSchedule``.
First off, let us instantiate a ``PipelineStage`` instance:
.. code-block:: python
# We are using `torchrun` to run this example with multiple processes.
# `torchrun` defines two environment variables: `RANK` and `WORLD_SIZE`.
rank = int(os.environ["RANK"])
world_size = int(os.environ["WORLD_SIZE"])
# Initialize distributed environment
import torch.distributed as dist
dist.init_process_group(rank=rank, world_size=world_size)
# Pipeline stage is our main pipeline runtime. It takes in the pipe object,
# the rank of this process, and the device.
from torch.distributed.pipelining import PipelineStage
stage = PipelineStage(pipe, rank, device)
We can now attach the ``PipelineStage`` to a pipeline schedule, GPipe for example, and run with data:
.. code-block:: python
from torch.distributed.pipelining import ScheduleGPipe
schedule = ScheduleGPipe(stage, chunks)
# Create a schedule
schedule = ScheduleGPipe(stage, n_microbatches)
# Input data (whole batch)
# Input data
x = torch.randn(batch_size, in_dim, device=device)
# Run the pipeline with input `x`
# `x` will be divided into microbatches automatically
# Run the pipeline with input `x`. Divide the batch into 4 micro-batches
# and run them in parallel on the pipeline
if rank == 0:
schedule.step(x)
else:
output = schedule.step()
Note that the above code needs to be launched for each worker, thus we use a
launcher service to launch multiple processes:
Note that since we split our model into three stages, we must run this script with three workers. For this example, we will use ``torchrun`` to run multiple processes within a single machine for demonstration purposes. We can collect up all of the code blocks above into a file named `example.py <https://github.com/pytorch/PiPPy/tree/main/examples/basic>`_ and then run it with ``torchrun`` like so:
.. code-block:: bash
torchrun --nproc_per_node=2 example.py
torchrun --nproc_per_node=3 example.py
Options for Splitting a Model
*****************************
.. _option_1_manual:
Option 1: splitting a model manually
====================================
To directly construct a ``PipelineStage``, the user is responsible for providing
a single ``nn.Module`` instance that owns the relevant ``nn.Parameters`` and
``nn.Buffers``, and defines a ``forward()`` method that executes the operations
relevant for that stage. For example, a condensed version of the Transformer
class defined in Torchtitan shows a pattern of building an easily partitionable
model.
.. code-block:: python
class Transformer(nn.Module):
def __init__(self, model_args: ModelArgs):
super().__init__()
self.tok_embeddings = nn.Embedding(...)
# Using a ModuleDict lets us delete layers witout affecting names,
# ensuring checkpoints will correctly save and load.
self.layers = torch.nn.ModuleDict()
for layer_id in range(model_args.n_layers):
self.layers[str(layer_id)] = TransformerBlock(...)
self.output = nn.Linear(...)
def forward(self, tokens: torch.Tensor):
# Handling layers being 'None' at runtime enables easy pipeline splitting
h = self.tok_embeddings(tokens) if self.tok_embeddings else tokens
for layer in self.layers.values():
h = layer(h, self.freqs_cis)
h = self.norm(h) if self.norm else h
output = self.output(h).float() if self.output else h
return output
A model defined in this manner can be easily configured per stage by first
initializing the whole model (using meta-device to avoid OOM errors), deleting
undesired layers for that stage, and then creating a PipelineStage that wraps
the model. For example:
.. code-block:: python
with torch.device("meta"):
assert num_stages == 2, "This is a simple 2-stage example"
# we construct the entire model, then delete the parts we do not need for this stage
# in practice, this can be done using a helper function that automatically divides up layers across stages.
model = Transformer()
if stage_index == 0:
# prepare the first stage model
del model.layers["1"]
model.norm = None
model.output = None
elif stage_index == 1:
# prepare the second stage model
model.tok_embeddings = None
del model.layers["0"]
from torch.distributed.pipelining import PipelineStage
stage = PipelineStage(
model,
stage_index,
num_stages,
device,
input_args=example_input_microbatch,
)
The ``PipelineStage`` requires an example argument ``input_args`` representing
the runtime input to the stage, which would be one microbatch worth of input
data. This argument is passed through the forward method of the stage module to
determine the input and output shapes required for communication.
When composing with other Data or Model parallelism techniques, ``output_args``
may also be required, if the output shape/dtype of the model chunk will be
affected.
.. _option_2_tracer:
Option 2: splitting a model automatically
=========================================
If you have a full model and do not want to spend time on modifying it into a
sequence of "model partitions", the ``pipeline`` API is here to help.
Here is a brief example:
.. code-block:: python
class Model(torch.nn.Module):
def __init__(self) -> None:
super().__init__()
self.emb = torch.nn.Embedding(10, 3)
self.layers = torch.nn.ModuleList(
Layer() for _ in range(2)
)
self.lm = LMHead()
def forward(self, x: torch.Tensor) -> torch.Tensor:
x = self.emb(x)
for layer in self.layers:
x = layer(x)
x = self.lm(x)
return x
If we print the model, we can see multiple hierarchies, which makes it hard to split by hand::
Model(
(emb): Embedding(10, 3)
(layers): ModuleList(
(0-1): 2 x Layer(
(lin): Linear(in_features=3, out_features=3, bias=True)
)
)
(lm): LMHead(
(proj): Linear(in_features=3, out_features=3, bias=True)
)
)
Let us see how the ``pipeline`` API works:
.. code-block:: python
from torch.distributed.pipelining import pipeline, SplitPoint
x = torch.LongTensor([1, 2, 4, 5])
pipe = pipeline(
module=mod,
num_chunks=1,
example_args=(x,),
split_spec={
"layers.1": SplitPoint.BEGINNING,
}
)
The ``pipeline`` API splits your model given a ``split_spec``, where
``SplitPoint.BEGINNING`` stands for adding a split point
*before* execution of certain submodule in the ``forward`` function, and
similarly, ``SplitPoint.END`` for split point *after* such.
If we ``print(pipe)``, we can see::
GraphModule(
(submod_0): GraphModule(
(emb): InterpreterModule()
(layers): Module(
(0): InterpreterModule(
(lin): InterpreterModule()
)
)
)
(submod_1): GraphModule(
(layers): Module(
(1): InterpreterModule(
(lin): InterpreterModule()
)
)
(lm): InterpreterModule(
(proj): InterpreterModule()
)
)
)
def forward(self, x):
submod_0 = self.submod_0(x); x = None
submod_1 = self.submod_1(submod_0); submod_0 = None
return (submod_1,)
The "model partitions" are represented by submodules (``submod_0``,
``submod_1``), each of which is reconstructed with original model operations
and hierarchies. In addition, a "root-level" ``forward`` function is
reconstructed to capture the data flow between those partitions. Such data flow
will be replayed by the pipeline runtime later, in a distributed fashion.
The ``Pipe`` object provides a method for retrieving the "model partitions":
.. code-block:: python
stage_mod : nn.Module = pipe.get_stage_module(stage_idx)
You can also create a distributed stage runtime on a device using ``Pipe``:
.. code-block:: python
stage = pipe.build_stage(stage_idx, device, group)
.. note::
The ``pipeline`` frontend uses a tracer (``torch.export``) to capture your
model into a single graph. If your model is not full-graph'able, you can use
our manual frontend below.
Hugging Face Examples
*********************
In the `PiPPy <https://github.com/pytorch/PiPPy>`_ repo where this package was
original created, we kept examples based on unmodified Hugging Face models.
See the `examples/huggingface
<https://github.com/pytorch/PiPPy/tree/main/examples/huggingface>`_ directory.
Examples include:
* `GPT2 <https://github.com/pytorch/PiPPy/tree/main/examples/huggingface/pippy_gpt2.py>`_
* `Llama <https://github.com/pytorch/PiPPy/tree/main/examples/llama>`_
Technical Deep Dive
*******************
How does the ``pipeline`` API split a model?
============================================
First, the ``pipeline`` API turns our model into a directed acyclic graph (DAG)
by tracing the model. It traces the model using ``torch.export`` -- a PyTorch 2
full-graph capturing tool.
Then, it groups together the **operations and parameters** needed by a stage
into a reconstructed submodule: ``submod_0``, ``submod_1``, ...
Different from conventional submodule access methods like ``Module.children()``,
the ``pipeline`` API does not only cut the module structure of your model, but
also the **forward** function of your model.
This is necessary because model structure like ``Module.children()`` merely
captures information during ``Module.__init__()``, and does not capture any
information about ``Module.forward()``. Said differently, ``Module.children()``
lacks information about the following aspects key to pipelininig:
* Execution order of child modules in ``forward``
* Activation flows between child modules
* Whether there are any functional operators between child modules (for example,
``relu`` or ``add`` operations will not be captured by ``Module.children()``).
The ``pipeline`` API, on the contrary, makes sure that the ``forward`` behavior
is truly preserved. It also captures the activation flow between the partitions,
helping the distributed runtime to make correct send/receive calls without human
intervention.
Another flexibility of the ``pipeline`` API is that split points can be at
arbitrary levels within your model hierarchy. In the split partitions, the original model
hierarchy related to that partition will be reconstructed at no cost to you.
At a result, fully-qualified names (FQNs) pointing to a submodule or parameter
would be still valid, and services that relies on FQNs (such as FSDP, TP or
checkpointing) can still run with your partitioned modules with almost zero code
change.
Implementing Your Own Schedule
******************************
You can implement your own pipeline schedule by extending one of the following two class:
* ``PipelineScheduleSingle``
* ``PipelineScheduleMulti``
``PipelineScheduleSingle`` is for schedules that assigns *only one* stage per rank.
``PipelineScheduleMulti`` is for schedules that assigns multiple stages per rank.
For example, ``ScheduleGPipe`` and ``Schedule1F1B`` are subclasses of ``PipelineScheduleSingle``.
Whereas, ``ScheduleInterleaved1F1B`` and ``ScheduleLoopedBFS`` are subclasses of ``PipelineScheduleMulti``.
API Reference
*************
.. automodule:: torch.distributed.pipelining
Model Split APIs
Pipeline Transformation APIs
============================
The following set of APIs transform your model into a pipeline representation.
@ -417,8 +194,14 @@ The following set of APIs transform your model into a pipeline representation.
.. autoclass:: Pipe
.. autofunction:: annotate_split_points
.. autofunction:: pipe_split
.. autoclass:: ArgsChunkSpec
.. autoclass:: KwargsChunkSpec
Microbatch Utilities
====================
@ -435,20 +218,20 @@ Microbatch Utilities
Pipeline Stages
===============
.. automodule:: torch.distributed.pipelining.stage
.. automodule:: torch.distributed.pipelining.PipelineStage
.. currentmodule:: torch.distributed.pipelining.stage
.. currentmodule:: torch.distributed.pipelining.PipelineStage
.. autoclass:: PipelineStage
.. autofunction:: build_stage
.. autoclass:: ManualPipelineStage
Pipeline Schedules
==================
.. automodule:: torch.distributed.pipelining.schedules
.. automodule:: torch.distributed.pipelining.PipelineSchedule
.. currentmodule:: torch.distributed.pipelining.schedules
.. currentmodule:: torch.distributed.pipelining.PipelineSchedule
.. autoclass:: ScheduleGPipe
@ -458,8 +241,22 @@ Pipeline Schedules
.. autoclass:: ScheduleLoopedBFS
Implementing Your Own Schedule
==============================
You can implement your own pipeline schedule by extending one of the following two class:
* ``PipelineScheduleSingle``
* ``PipelineScheduleMulti``
``PipelineScheduleSingle`` is for schedules that assigns *only one* stage per rank.
``PipelineScheduleMulti`` is for schedules that assigns multiple stages per rank.
For example, ``ScheduleGPipe`` and ``Schedule1F1B`` are subclasses of ``PipelineScheduleSingle``.
Whereas, ``ScheduleInterleaved1F1B`` and ``ScheduleLoopedBFS`` are subclasses of ``PipelineScheduleMulti``.
.. currentmodule:: torch.distributed.pipelining.PipelineSchedule
.. autoclass:: PipelineScheduleSingle
:members:
.. autoclass:: PipelineScheduleMulti
:members:

View File

@ -876,6 +876,9 @@ If you are running single node training, it may be convenient to interactively b
.. py:module:: torch.distributed.nn.api
.. py:module:: torch.distributed.nn.jit
.. py:module:: torch.distributed.nn.jit.templates
.. py:module:: torch.distributed.pipeline
.. py:module:: torch.distributed.pipeline.sync
.. py:module:: torch.distributed.pipeline.sync.skip
.. py:module:: torch.distributed.tensor
.. py:module:: torch.distributed.algorithms.ddp_comm_hooks.ddp_zero_hook
.. py:module:: torch.distributed.algorithms.ddp_comm_hooks.debugging_hooks
@ -961,6 +964,22 @@ If you are running single node training, it may be convenient to interactively b
.. py:module:: torch.distributed.optim.post_localSGD_optimizer
.. py:module:: torch.distributed.optim.utils
.. py:module:: torch.distributed.optim.zero_redundancy_optimizer
.. py:module:: torch.distributed.pipeline.sync.batchnorm
.. py:module:: torch.distributed.pipeline.sync.checkpoint
.. py:module:: torch.distributed.pipeline.sync.copy
.. py:module:: torch.distributed.pipeline.sync.dependency
.. py:module:: torch.distributed.pipeline.sync.microbatch
.. py:module:: torch.distributed.pipeline.sync.phony
.. py:module:: torch.distributed.pipeline.sync.pipe
.. py:module:: torch.distributed.pipeline.sync.pipeline
.. py:module:: torch.distributed.pipeline.sync.skip.layout
.. py:module:: torch.distributed.pipeline.sync.skip.namespace
.. py:module:: torch.distributed.pipeline.sync.skip.portal
.. py:module:: torch.distributed.pipeline.sync.skip.skippable
.. py:module:: torch.distributed.pipeline.sync.skip.tracker
.. py:module:: torch.distributed.pipeline.sync.stream
.. py:module:: torch.distributed.pipeline.sync.utils
.. py:module:: torch.distributed.pipeline.sync.worker
.. py:module:: torch.distributed.remote_device
.. py:module:: torch.distributed.rendezvous
.. py:module:: torch.distributed.rpc.api

View File

@ -683,7 +683,6 @@ API Reference
.. automethod:: dynamic_shapes
.. autofunction:: torch.export.dynamic_shapes.refine_dynamic_shapes_from_suggested_fixes
.. autoclass:: Constraint
.. autoclass:: ExportedProgram

View File

@ -103,6 +103,7 @@ Features described in this documentation are classified by release status:
optim
complex_numbers
ddp_comm_hooks
pipeline
quantization
rpc
torch.random <random>

View File

@ -463,7 +463,7 @@ functions are used in the research community since complex numbers are not part
ordered field and so having complex valued loss does not make much sense.
It also turns out that no interesting real-valued objective fulfill the
Cauchy-Riemann equations. So the theory with holomorphic function cannot be
Cauchy-Riemann equations. So the theory with homomorphic function cannot be
used for optimization and most people therefore use the Wirtinger calculus.
Wirtinger Calculus comes into the picture ...
@ -602,7 +602,7 @@ Solving the above equations for :math:`\frac{\partial L}{\partial u}` and :math:
.. math::
\begin{aligned}
\frac{\partial L}{\partial u} = \frac{\partial L}{\partial s} + \frac{\partial L}{\partial s^*} \\
\frac{\partial L}{\partial v} = 1j * \left(\frac{\partial L}{\partial s} - \frac{\partial L}{\partial s^*}\right)
\frac{\partial L}{\partial v} = -1j * \left(\frac{\partial L}{\partial s} - \frac{\partial L}{\partial s^*}\right)
\end{aligned}
:label: [3]
@ -610,9 +610,9 @@ Substituting :eq:`[3]` in :eq:`[1]`, we get:
.. math::
\begin{aligned}
\frac{\partial L}{\partial z^*} &= \left(\frac{\partial L}{\partial s} + \frac{\partial L}{\partial s^*}\right) * \frac{\partial u}{\partial z^*} + 1j * \left(\frac{\partial L}{\partial s} - \frac{\partial L}{\partial s^*}\right) * \frac{\partial v}{\partial z^*} \\
\frac{\partial L}{\partial z^*} &= \left(\frac{\partial L}{\partial s} + \frac{\partial L}{\partial s^*}\right) * \frac{\partial u}{\partial z^*} - 1j * \left(\frac{\partial L}{\partial s} - \frac{\partial L}{\partial s^*}\right) * \frac{\partial v}{\partial z^*} \\
&= \frac{\partial L}{\partial s} * \left(\frac{\partial u}{\partial z^*} + \frac{\partial v}{\partial z^*} j\right) + \frac{\partial L}{\partial s^*} * \left(\frac{\partial u}{\partial z^*} - \frac{\partial v}{\partial z^*} j\right) \\
&= \frac{\partial L}{\partial s} * \frac{\partial (u + vj)}{\partial z^*} + \frac{\partial L}{\partial s^*} * \frac{\partial (u + vj)^*}{\partial z^*} \\
&= \frac{\partial L}{\partial s^*} * \frac{\partial (u + vj)}{\partial z^*} + \frac{\partial L}{\partial s} * \frac{\partial (u + vj)^*}{\partial z^*} \\
&= \frac{\partial L}{\partial s} * \frac{\partial s}{\partial z^*} + \frac{\partial L}{\partial s^*} * \frac{\partial s^*}{\partial z^*} \\
\end{aligned}

85
docs/source/pipeline.rst Normal file
View File

@ -0,0 +1,85 @@
.. _pipeline-parallelism:
Pipeline Parallelism
====================
Pipeline parallelism was original introduced in the
`Gpipe <https://arxiv.org/abs/1811.06965>`__ paper and is an efficient
technique to train large models on multiple GPUs.
.. warning ::
torch.distributed.pipeline is deprecated, so is this document. For
up-to-date pipeline parallel implementation, please refer to the
`PiPPy <https://github.com/pytorch/PiPPy>`__ library under the PyTorch
organization (Pipeline Parallelism for PyTorch).
Model Parallelism using multiple GPUs
-------------------------------------
Typically for large models which don't fit on a single GPU, model parallelism
is employed where certain parts of the model are placed on different GPUs.
Although, if this is done naively for sequential models, the training process
suffers from GPU under utilization since only one GPU is active at one time as
shown in the figure below:
.. figure:: _static/img/pipeline_parallelism/no_pipe.png
The figure represents a model with 4 layers placed on 4 different GPUs
(vertical axis). The horizontal axis represents training this model through
time demonstrating that only 1 GPU is utilized at a time
(`image source <https://arxiv.org/abs/1811.06965>`__).
Pipelined Execution
-------------------
To alleviate this problem, pipeline parallelism splits the input minibatch into
multiple microbatches and pipelines the execution of these microbatches across
multiple GPUs. This is outlined in the figure below:
.. figure:: _static/img/pipeline_parallelism/pipe.png
The figure represents a model with 4 layers placed on 4 different GPUs
(vertical axis). The horizontal axis represents training this model through
time demonstrating that the GPUs are utilized much more efficiently.
However, there still exists a bubble (as demonstrated in the figure) where
certain GPUs are not utilized.
(`image source <https://arxiv.org/abs/1811.06965>`__).
Pipe APIs in PyTorch
--------------------
.. autoclass:: torch.distributed.pipeline.sync.Pipe
:members: forward
Skip connections
^^^^^^^^^^^^^^^^
Certain models like `ResNeXt <https://pytorch.org/hub/pytorch_vision_resnext/>`__
are not completely sequential and have skip connections between layers.
Naively implementing as part of pipeline parallelism would imply that
we need to copy outputs for certain layers through multiple GPUs till
we eventually reach the GPU where the layer for the skip connection resides.
To avoid this copy overhead, we provide APIs below to stash and pop Tensors
in different layers of the model.
.. autofunction:: torch.distributed.pipeline.sync.skip.skippable.skippable
.. autoclass:: torch.distributed.pipeline.sync.skip.skippable.stash
.. autoclass:: torch.distributed.pipeline.sync.skip.skippable.pop
.. autofunction:: torch.distributed.pipeline.sync.skip.skippable.verify_skippables
Tutorials
---------
The following tutorials give a good overview of how to use the
:class:`~torch.distributed.pipeline.sync.Pipe` API to train your models with the
rest of the components that PyTorch provides:
- `Training Transformer models using Pipeline Parallelism <https://pytorch.org/tutorials/intermediate/pipeline_tutorial.html>`__
- `Training Transformer models using Distributed Data Parallel and Pipeline Parallelism <https://pytorch.org/tutorials/advanced/ddp_pipeline.html>`__
Acknowledgements
----------------
The implementation for pipeline parallelism is based on `fairscale's pipe implementation <https://github.com/facebookresearch/fairscale/tree/main/fairscale/nn/pipe>`__ and
`torchgpipe <https://github.com/kakaobrain/torchgpipe>`__. We would like to
thank both teams for their contributions and guidance towards bringing pipeline
parallelism into PyTorch.

View File

@ -37,9 +37,7 @@ For more details on ``torch.export``, you can refer to the :ref:`torch.export do
If you have a CUDA-enabled device on your machine and you installed PyTorch with CUDA support,
the following code will compile the model into a shared library for CUDA execution.
Otherwise, the compiled artifact will run on CPU. For better performance during CPU inference,
it is suggested to enable freezing by setting `export TORCHINDUCTOR_FREEZING=1`
before running the Python script below.
Otherwise, the compiled artifact will run on CPU.
.. code-block:: python

View File

@ -1640,6 +1640,16 @@ static PyObject* _dims(PyObject *self,
PY_END(nullptr)
}
static int64_t dim_index(const std::vector<mpy::obj<Dim>>& dims, mpy::hdl<Dim> dim) {
for (int64_t i = 0, N = dims.size(); i < N; ++i) {
if (dims[i].ptr() == dim.ptr()) {
return i;
}
}
return -1;
}
struct DotPart {
Slice<DimEntry> dims;
size_t total_size = 1;

View File

@ -385,6 +385,10 @@ bool is_int(handle h) {
return PyLong_Check(h.ptr());
}
bool is_float(handle h) {
return PyFloat_Check(h.ptr());
}
bool is_none(handle h) {
return h.ptr() == Py_None;
}

View File

@ -132,7 +132,6 @@ select = [
"RUF016", # type error non-integer index
"RUF017",
"RUF018", # no assignment in assert
"TCH",
"TRY002", # ban vanilla raise (todo fix NOQAs)
"TRY302",
"TRY401", # verbose-log-message
@ -176,10 +175,6 @@ select = [
# autogenerated #TODO figure out why file level noqa is ignored
"torch/_inductor/fx_passes/serialized_patterns/**" = ["F401", "F501"]
"torch/onnx/**" = [
"TCH001", # beartype may need runtime types
"TCH002",
"TCH003",
"TCH004",
"UP037", # ONNX does runtime type checking
]

View File

@ -1,23 +1,11 @@
#!/bin/bash
# Updates Triton to the pinned version for this copy of PyTorch
BRANCH=$(git rev-parse --abbrev-ref HEAD)
TRITON_VERSION="pytorch-triton==$(cat .ci/docker/triton_version.txt)"
DOWNLOAD_PYTORCH_ORG="https://download.pytorch.org/whl"
if [[ -z "${USE_XPU}" ]]; then
# Default install from PyTorch source
TRITON_VERSION="pytorch-triton==$(cat .ci/docker/triton_version.txt)"
DOWNLOAD_PYTORCH_ORG="https://download.pytorch.org/whl"
if [[ "$BRANCH" =~ .*release.* ]]; then
pip install --index-url ${DOWNLOAD_PYTORCH_ORG}/test/ $TRITON_VERSION
else
pip install --index-url ${DOWNLOAD_PYTORCH_ORG}/nightly/ $TRITON_VERSION+$(head -c 10 .ci/docker/ci_commit_pins/triton.txt)
fi
if [[ "$BRANCH" =~ .*release.* ]]; then
pip install --index-url ${DOWNLOAD_PYTORCH_ORG}/test/ $TRITON_VERSION
else
# Always install Triton for XPU from source
TRITON_XPU_REPO="https://github.com/intel/intel-xpu-backend-for-triton"
TRITON_XPU_COMMIT_ID="$(cat .ci/docker/ci_commit_pins/triton-xpu.txt)"
# force-reinstall to ensure the pinned version is installed
pip install --force-reinstall "git+${TRITON_XPU_REPO}@${TRITON_XPU_COMMIT_ID}#subdirectory=python"
pip install --index-url ${DOWNLOAD_PYTORCH_ORG}/nightly/ $TRITON_VERSION+$(head -c 10 .ci/docker/ci_commit_pins/triton.txt)
fi

View File

@ -211,6 +211,30 @@
"torch.distributed.optim.utils": [
"Type"
],
"torch.distributed.pipeline.sync.pipe": [
"Pipeline"
],
"torch.distributed.pipeline.sync.skip.layout": [
"SkipLayout",
"inspect_skip_layout"
],
"torch.distributed.pipeline.sync.skip.portal": [
"Context",
"Portal",
"PortalBlue",
"PortalCopy",
"PortalOrange"
],
"torch.distributed.pipeline.sync.skip.skippable": [
"Skippable"
],
"torch.distributed.pipeline.sync.skip.tracker": [
"SkipTracker",
"SkipTrackerThroughPotals",
"ThreadLocal",
"current_skip_tracker",
"use_skip_tracker"
],
"torch.distributed.remote_device": [
"Optional",
"Union"
@ -1297,10 +1321,12 @@
"_weight_norm_interface",
"autocast",
"broadcast_shapes",
"candidate",
"compiled_with_cxx11_abi",
"from_dlpack",
"lobpcg",
"lu",
"obj",
"segment_reduce",
"set_default_dtype",
"set_grad_enabled",
@ -1671,6 +1697,10 @@
"get_args_parser",
"run"
],
"torch.distributed.pipeline.sync": [
"NoChunk",
"WithDevice"
],
"torch.distributed.rpc.rref_proxy": [
"Future",
"partial",
@ -2604,12 +2634,32 @@
"TensorPipeRpcBackendOptions"
],
"torch.distributed.pipelining": [
"ArgsChunkSpec",
"KwargsChunkSpec",
"Pipe",
"PipelineStage",
"SplitPoint",
"annotate_split_points",
"pipe_split",
"pipeline"
],
"torch.distributed.pipelining.PipelineSchedule": [
"ABC",
"Any",
"Callable",
"Dict",
"List",
"Optional",
"Pipe",
"PipelineStageBase",
"Tuple",
"Union",
"abstractmethod",
"defaultdict",
"merge_chunks",
"record_function",
"split_args_kwargs_into_chunks"
],
"torch.distributed.pipelining.microbatch": [
"Any",
"Dict",

View File

@ -7,9 +7,10 @@ import sys
import xml.etree.ElementTree as ET
from collections import defaultdict
from types import MethodType
from typing import Any, List, Optional, TYPE_CHECKING, Union
from typing import Any, List, Optional, Union
import pytest
from _pytest._code.code import ReprFileLocation
from _pytest.config import Config, filename_arg
from _pytest.config.argparsing import Parser
from _pytest.junitxml import _NodeReporter, bin_xml_escape, LogXML
@ -19,9 +20,6 @@ from _pytest.stash import StashKey
from _pytest.terminal import _get_raw_skip_reason
from pytest_shard_custom import pytest_addoptions as shard_addoptions, PytestShardPlugin
if TYPE_CHECKING:
from _pytest._code.code import ReprFileLocation
# a lot of this file is copied from _pytest.junitxml and modified to get rerun info
xml_key = StashKey["LogXMLReruns"]()

View File

@ -12,7 +12,7 @@ from torch.distributed._spmd.parallel_mode import DataParallel
from torch.distributed._tensor import Replicate
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.testing._internal.common_distributed import skip_if_lt_x_gpu
from torch.testing._internal.common_utils import run_tests # noqa: TCH001
from torch.testing._internal.common_utils import run_tests
from torch.testing._internal.distributed._tensor.common_dtensor import (
DTensorTestBase,
with_comms,

View File

@ -2,7 +2,7 @@
import os
from torch.distributed._spmd.graph_utils import dump_graphs_to_files
from torch.testing._internal.common_utils import run_tests # noqa: TCH001
from torch.testing._internal.common_utils import run_tests
from torch.testing._internal.distributed._tensor.common_dtensor import DTensorTestBase

View File

@ -20,7 +20,7 @@ from torch.fx.experimental.proxy_tensor import make_fx
from torch.nn import functional as F
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.testing._internal.common_distributed import skip_if_lt_x_gpu
from torch.testing._internal.common_utils import run_tests # noqa: TCH001
from torch.testing._internal.common_utils import run_tests
from torch.testing._internal.distributed._tensor.common_dtensor import (
DTensorTestBase,
with_comms as base_with_comms,
@ -46,7 +46,7 @@ class TraceDeviceMeshTestBase:
local_tensor = torch.ones(3, 3, device=self.device_type) * self.rank
# check all dim groups
dim_to_subgroups = mesh.get_all_groups()
dim_to_subgroups = mesh.get_group()
for dim, dim_group in enumerate(dim_to_subgroups):
dim_group_size = get_world_size(dim_group)
global_ranks = [

View File

@ -14,13 +14,7 @@ from torch.distributed._tensor import (
init_device_mesh,
)
from torch.distributed._tensor.debug import CommDebugMode
from torch.distributed._tensor.placement_types import (
DTensorSpec,
Partial,
Replicate,
Shard,
TensorMeta,
)
from torch.distributed._tensor.placement_types import Partial, Replicate, Shard
from torch.distributed.tensor.parallel import (
ColwiseParallel,
parallelize_module,
@ -61,29 +55,27 @@ class DTensorTest(DTensorTestBase):
device_mesh = DeviceMesh(self.device_type, list(range(self.world_size)))
placements = [Shard(0)]
local_tensor = torch.randn(3, 3, requires_grad=True)
spec = DTensorSpec(
device_mesh,
tuple(placements),
tensor_meta=TensorMeta(
torch.Size([self.world_size * 3, 3]),
local_tensor.stride(),
local_tensor.dtype,
),
)
dist_tensor_shape = torch.Size([self.world_size * 3, 3])
dist_tensor = DTensor(
local_tensor,
spec,
device_mesh,
placements,
shape=dist_tensor_shape,
dtype=local_tensor.dtype,
requires_grad=True,
stride=local_tensor.stride(),
)
self.assertEqual(dist_tensor.size(), torch.Size((self.world_size * 3, 3)))
with self.assertWarnsRegex(UserWarning, "To construct"):
DTensor(
local_tensor,
spec,
device_mesh,
placements,
shape=dist_tensor_shape,
dtype=local_tensor.dtype,
requires_grad=False,
stride=local_tensor.stride(),
)
@with_comms
@ -280,23 +272,19 @@ class DTensorTest(DTensorTestBase):
def test_to_local(self):
device_mesh = DeviceMesh(self.device_type, list(range(self.world_size)))
placements = (Shard(0),)
dist_tensor_shape = torch.Size([self.world_size * 3, 3])
local_tensor_with_grad = torch.randn(
3, 3, device=self.device_type, requires_grad=True
)
dist_tensor_shape = torch.Size([self.world_size * 3, 3])
spec = DTensorSpec(
mesh=device_mesh,
placements=placements,
tensor_meta=TensorMeta(
dist_tensor_shape,
local_tensor_with_grad.stride(),
local_tensor_with_grad.dtype,
),
)
sharded_tensor = DTensor(
local_tensor_with_grad,
spec,
device_mesh,
placements,
shape=dist_tensor_shape,
dtype=local_tensor_with_grad.dtype,
requires_grad=True,
stride=local_tensor_with_grad.stride(),
)
self.assertEqual(sharded_tensor.size(), dist_tensor_shape)
self.assertEqual(sharded_tensor.to_local(), local_tensor_with_grad)
@ -331,11 +319,6 @@ class DTensorTest(DTensorTestBase):
except RuntimeError:
self.assertEqual(sharded_tensor.grad.stride(), [1, 3 * self.world_size])
# test the case under no-grad we directly return the local tensor
with torch.no_grad():
local_no_grad = sharded_tensor.to_local()
assert local_no_grad is sharded_tensor._local_tensor
@with_comms
def test_to_local_grad_hint(self):
device_mesh = DeviceMesh(self.device_type, list(range(self.world_size)))

View File

@ -21,7 +21,6 @@ from torch.distributed._tensor import (
Replicate,
Shard,
)
from torch.distributed._tensor.placement_types import DTensorSpec, TensorMeta
from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import (
checkpoint_wrapper,
CheckpointImpl,
@ -194,45 +193,41 @@ class TestDTensorCompile(torch._dynamo.test_case.TestCase):
def test_dtensor_constructor_w_graph_break(self):
mesh = DeviceMesh(self.device_type, torch.arange(self.world_size))
x = torch.randn(64, 32, requires_grad=True)
spec = DTensorSpec(
mesh,
(Replicate(), Shard(0)),
tensor_meta=TensorMeta(
shape=torch.Size([128, 32]), stride=(32, 1), dtype=x.dtype
),
)
# test passing in DTensor as inputs/outputs and run some tensor computation
def fn(x):
print("graph break!")
return DTensor(
x,
spec,
mesh,
(Replicate(), Shard(0)),
shape=[128, 32],
dtype=x.dtype,
requires_grad=x.requires_grad,
stride=[32, 1],
)
x = torch.randn(64, 32, requires_grad=True)
out = fn(x)
out2 = torch.compile(fn, backend="eager")(x)
def test_dtensor_constructor_w_dynamo_disable(self):
mesh = DeviceMesh(self.device_type, torch.arange(self.world_size))
x = torch.randn(32, requires_grad=True)
spec = DTensorSpec(
mesh,
(Replicate(),),
tensor_meta=TensorMeta(shape=torch.Size([32]), stride=(1,), dtype=x.dtype),
)
@torch._dynamo.disable(recursive=False)
def fn(x):
print("foo")
return DTensor(
x,
spec,
mesh,
(Replicate(),),
shape=torch.Size([32]),
dtype=x.dtype,
requires_grad=x.requires_grad,
stride=(1,),
)
x = torch.randn(32, requires_grad=True)
out = fn(x)
out2 = torch.compile(fn, backend="eager")(x)
self.assertEqual(out, out2)

View File

@ -10,12 +10,7 @@ from torch.distributed._tensor._utils import (
)
from torch.distributed._tensor.debug import CommDebugMode
from torch.distributed._tensor.placement_types import (
DTensorSpec,
Replicate,
Shard,
TensorMeta,
)
from torch.distributed._tensor.placement_types import Replicate, Shard
from torch.distributed.device_mesh import DeviceMesh, init_device_mesh
from torch.testing._internal.common_utils import run_tests
@ -190,20 +185,14 @@ class Test2DStridedLocalShard(DTensorTestBase):
chunks = list(torch.chunk(dtensor_tp.to_local(), 2, dim=0))
shard_rank = 0 if self.rank // 2 == 0 else 1
sharded_param = chunks[shard_rank]
spec_2d = DTensorSpec(
mesh=mesh_2d,
placements=(Shard(0), Shard(0)),
tensor_meta=TensorMeta(
global_tensor.size(),
global_tensor.stride(),
global_tensor.dtype,
),
)
dtensor_2d = DTensor(
sharded_param,
spec_2d,
mesh_2d,
[Shard(0), Shard(0)],
shape=global_tensor.size(),
dtype=global_tensor.dtype,
requires_grad=False,
stride=global_tensor.stride(),
)
self.assertEqual(

View File

@ -1,16 +1,13 @@
# Owner(s): ["oncall: distributed"]
from collections import OrderedDict
from typing import TYPE_CHECKING
import torch
import torch.distributed.checkpoint._traverse as _traverse
from torch.distributed.checkpoint.metadata import STATE_DICT_TYPE
from torch.testing._internal.common_utils import run_tests, TestCase
if TYPE_CHECKING:
from torch.distributed.checkpoint.metadata import STATE_DICT_TYPE
# TODO: add comments for TestTraverse
class TestTraverse(TestCase):

Some files were not shown because too many files have changed in this diff Show More