mirror of https://github.com/pytorch/pytorch.git (synced 2025-11-20 02:24:54 +08:00)

Compare commits: ciflow/vll...update_sub (29 commits)
| SHA1 |
|---|
| 8a5c2f0432 |
| 7a963ffc0b |
| 9f94c7b8ee |
| e5a766ece4 |
| a5f36a8fda |
| 1c0bf2a0bb |
| 9abc9aac38 |
| 789240bae2 |
| f49833de54 |
| 28c7602c90 |
| 6fc430644b |
| d48cae96a6 |
| 65f08eeec1 |
| 13ec55d15b |
| cea86781f2 |
| cdca10b275 |
| b8a3165d28 |
| 8f161997b1 |
| c8d790b56d |
| 878757cb66 |
| a369a56726 |
| a4e0720fe2 |
| 1efc14a50d |
| dc4f3c7505 |
| e8970ba010 |
| 41999a579d |
| ebb2001a48 |
| ae85307512 |
| 7921c0eb0e |
.ci/docker/build.sh

```diff
@@ -125,10 +125,10 @@ case "$tag" in
     UCC_COMMIT=${_UCC_COMMIT}
     TRITON=yes
     ;;
-  pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks)
+  pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11-inductor-benchmarks)
     CUDA_VERSION=12.8.1
     ANACONDA_PYTHON_VERSION=3.10
-    GCC_VERSION=9
+    GCC_VERSION=11
     VISION=yes
     KATEX=yes
     UCX_COMMIT=${_UCX_COMMIT}
```
```diff
@@ -146,16 +146,6 @@ case "$tag" in
     UCC_COMMIT=${_UCC_COMMIT}
     TRITON=yes
     ;;
-  pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9)
-    CUDA_VERSION=12.8.1
-    ANACONDA_PYTHON_VERSION=3.10
-    GCC_VERSION=9
-    VISION=yes
-    KATEX=yes
-    UCX_COMMIT=${_UCX_COMMIT}
-    UCC_COMMIT=${_UCC_COMMIT}
-    TRITON=yes
-    ;;
   pytorch-linux-jammy-py3-clang12-onnx)
     ANACONDA_PYTHON_VERSION=3.10
     CLANG_VERSION=12
```
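The two hunks above rename the inductor-benchmarks image tag from gcc9 to gcc11 and delete the standalone gcc9 image entirely. As a rough Python model of the tag-to-build-args dispatch that this case statement implements (a sketch for illustration only, not the repo's actual code; the unknown-tag error path is an assumption):

```python
# Toy model of the tag -> build-args dispatch in .ci/docker/build.sh.
IMAGE_CONFIGS = {
    "pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11-inductor-benchmarks": {
        "CUDA_VERSION": "12.8.1",
        "ANACONDA_PYTHON_VERSION": "3.10",
        "GCC_VERSION": "11",  # was 9 before this PR
        "VISION": "yes",
        "KATEX": "yes",
    },
    # The plain cuda12.8 gcc9 entry was removed entirely in this PR.
}

def config_for(tag: str) -> dict[str, str]:
    """Look up the docker build arguments for an image tag."""
    try:
        return IMAGE_CONFIGS[tag]
    except KeyError:
        # Assumed error path for unrecognized tags.
        raise SystemExit(f"unknown image tag: {tag}")

print(config_for("pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11-inductor-benchmarks"))
```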
.github/ci_commit_pins/vision.txt (vendored, 2 changes)

```diff
@@ -1 +1 @@
-2d82dc5caa336d179d9b46ac4a0fb8c43d84c5cc
+617079d944b0e72632311c30ae2bbdf1168b901e
```
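The pin file holds a single torchvision commit hash that CI builds against. A minimal sketch of how such a pin can be consumed, assuming a hypothetical clone location (the repo's real checkout logic lives elsewhere and may differ):

```python
import subprocess
from pathlib import Path

# Pin file location matches the diff above; the clone path is assumed.
PIN_FILE = Path(".github/ci_commit_pins/vision.txt")
VISION_CHECKOUT = Path("/tmp/vision")

def checkout_pinned_vision() -> str:
    """Clone torchvision and check out the commit recorded in the pin file."""
    commit = PIN_FILE.read_text().strip()
    if not VISION_CHECKOUT.exists():
        subprocess.run(
            ["git", "clone", "https://github.com/pytorch/vision.git", str(VISION_CHECKOUT)],
            check=True,
        )
    subprocess.run(["git", "checkout", commit], cwd=VISION_CHECKOUT, check=True)
    return commit

if __name__ == "__main__":
    print("checked out torchvision @", checkout_pinned_vision())
```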
.github/scripts/generate_binary_build_matrix.py

```diff
@@ -50,6 +50,7 @@ CUDA_AARCH64_ARCHES = ["12.6-aarch64", "12.8-aarch64", "12.9-aarch64", "13.0-aar
 
 PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
     "12.6": (
+        "cuda-bindings==12.9.4; platform_system == 'Linux' | "
         "nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | "
         "nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | "
         "nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | "
@@ -67,6 +68,7 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
         "nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'"
     ),
     "12.8": (
+        "cuda-bindings==12.9.4; platform_system == 'Linux' | "
         "nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | "
         "nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | "
         "nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | "
@@ -84,6 +86,7 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
         "nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'"
     ),
     "12.9": (
+        "cuda-bindings==12.9.4; platform_system == 'Linux' | "
         "nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | "
         "nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | "
         "nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | "
@@ -101,6 +104,7 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
         "nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'"
     ),
     "13.0": (
+        "cuda-bindings==13.0.3; platform_system == 'Linux' | "
         "nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | "
         "nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | "
         "nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | "
```
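Each value in PYTORCH_EXTRA_INSTALL_REQUIREMENTS is one long string of PEP 508 requirements joined with ' | ', and this PR prepends a cuda-bindings entry to every CUDA variant. A minimal sketch of splitting and parsing such a string with the packaging library (the blob below is a truncated excerpt; the helper is illustrative, not the generator's actual code):

```python
from packaging.requirements import Requirement

# Truncated excerpt of the "12.6" entry from the diff above.
EXTRA_INSTALL_REQUIREMENTS = (
    "cuda-bindings==12.9.4; platform_system == 'Linux' | "
    "nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | "
    "nvidia-nccl-cu12==2.27.5; platform_system == 'Linux'"
)

def split_requirements(blob: str) -> list[Requirement]:
    """Split the ' | '-delimited blob into parsed PEP 508 requirements."""
    return [Requirement(part.strip()) for part in blob.split(" | ")]

for req in split_requirements(EXTRA_INSTALL_REQUIREMENTS):
    print(req.name, req.specifier, req.marker)
```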
```diff
@@ -23,7 +23,7 @@ jobs:
     uses: ./.github/workflows/_linux-build.yml
     with:
       runner: linux.12xlarge.memory
-      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm80
       docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
       cuda-arch-list: '8.0 9.0'
       test-matrix: |
@@ -39,7 +39,7 @@ jobs:
     needs: attn-microbenchmark-build
     with:
       timeout-minutes: 500
-      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm80
       docker-image: ${{ needs.attn-microbenchmark-build.outputs.docker-image }}
       test-matrix: ${{ needs.attn-microbenchmark-build.outputs.test-matrix }}
     secrets: inherit
@@ -51,7 +51,7 @@ jobs:
     uses: ./.github/workflows/_linux-build.yml
     with:
       runner: linux.12xlarge.memory
-      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100
       docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
       cuda-arch-list: '10.0'
       test-matrix: |
@@ -66,7 +66,7 @@ jobs:
     needs: opmicrobenchmark-build-b200
     with:
       timeout-minutes: 500
-      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100
       docker-image: ${{ needs.opmicrobenchmark-build-b200.outputs.docker-image }}
       test-matrix: ${{ needs.opmicrobenchmark-build-b200.outputs.test-matrix }}
       aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
```
.github/workflows/docker-builds.yml (vendored, 3 changes)

```diff
@@ -52,8 +52,7 @@ jobs:
           pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11,
           pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11,
           pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc11-vllm,
-          pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks,
-          pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9,
+          pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11-inductor-benchmarks,
           pytorch-linux-jammy-cuda12.4-cudnn9-py3-gcc11,
           pytorch-linux-jammy-py3.10-clang12,
           pytorch-linux-jammy-py3.11-clang12,
```
.github/workflows/docker-cache-rocm.yml (vendored, 10 changes)

```diff
@@ -6,10 +6,9 @@ on:
     branches: [main, release]
     types:
       - completed
-  workflow_dispatch:
 
 concurrency:
-  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name }}
+  group: ${{ github.workflow }}-${{ github.event.workflow_run.head_branch }}
   cancel-in-progress: true
 
 permissions:
@@ -50,9 +49,10 @@ jobs:
       matrix:
         runner: [linux.rocm.gfx942.docker-cache]
         docker-image: [
-          "${{ needs.download-docker-builds-artifacts.outputs.pytorch-linux-jammy-rocm-n-py3 }}",
-          "${{ needs.download-docker-builds-artifacts.outputs.pytorch-linux-noble-rocm-n-py3 }}",
-          "${{ needs.download-docker-builds-artifacts.outputs.pytorch-linux-jammy-rocm-n-py3-benchmarks }}"
+          "${{ needs.download-docker-builds-artifacts.outputs.pytorch-linux-jammy-rocm-n-py3 }}"
+          #"${{ needs.download-docker-builds-artifacts.outputs.pytorch-linux-jammy-rocm-n-py3 }}",
+          #"${{ needs.download-docker-builds-artifacts.outputs.pytorch-linux-noble-rocm-n-py3 }}",
+          #"${{ needs.download-docker-builds-artifacts.outputs.pytorch-linux-jammy-rocm-n-py3-benchmarks }}"
         ]
     runs-on: "${{ matrix.runner }}"
     steps:
```
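The concurrency change keys cancellation on the triggering run's head branch rather than a PR number/SHA pair, so repeated runs for the same branch supersede each other. A toy Python model of that cancel-in-progress behavior (an assumed approximation of GitHub's scheduler, not its implementation):

```python
# Toy model of GitHub Actions concurrency groups (illustrative only).
from dataclasses import dataclass, field

@dataclass
class ConcurrencyGroups:
    cancel_in_progress: bool = True
    running: dict[str, str] = field(default_factory=dict)  # group -> run id

    def submit(self, group: str, run_id: str) -> str | None:
        """Start run_id in `group`; return the run it cancelled, if any."""
        cancelled = None
        if group in self.running and self.cancel_in_progress:
            cancelled = self.running[group]
        self.running[group] = run_id
        return cancelled

groups = ConcurrencyGroups()
# Two runs triggered for the same head branch map to the same group,
# so the second cancels the first:
g = "docker-cache-rocm-main"  # e.g. "${{ github.workflow }}-${{ ...head_branch }}"
assert groups.submit(g, "run-1") is None
assert groups.submit(g, "run-2") == "run-1"
```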
.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml (generated, vendored, 56 changes)

```diff
@@ -132,7 +132,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_10-cuda-aarch64-12_6
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
       timeout-minutes: 420
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -178,7 +178,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_10-cuda-aarch64-12_8
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
       timeout-minutes: 420
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -224,7 +224,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_10-cuda-aarch64-12_9
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
       timeout-minutes: 420
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -270,7 +270,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_10-cuda-aarch64-13_0
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-bindings==13.0.3; platform_system == 'Linux' | nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
       timeout-minutes: 420
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
```
```diff
@@ -381,7 +381,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_11-cuda-aarch64-12_6
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
       timeout-minutes: 420
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -427,7 +427,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_11-cuda-aarch64-12_8
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
       timeout-minutes: 420
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -473,7 +473,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_11-cuda-aarch64-12_9
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
       timeout-minutes: 420
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -519,7 +519,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_11-cuda-aarch64-13_0
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-bindings==13.0.3; platform_system == 'Linux' | nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
       timeout-minutes: 420
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
```
```diff
@@ -630,7 +630,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_12-cuda-aarch64-12_6
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
       timeout-minutes: 420
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -676,7 +676,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_12-cuda-aarch64-12_8
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
       timeout-minutes: 420
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -722,7 +722,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_12-cuda-aarch64-12_9
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
       timeout-minutes: 420
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -768,7 +768,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_12-cuda-aarch64-13_0
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-bindings==13.0.3; platform_system == 'Linux' | nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
       timeout-minutes: 420
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
```
```diff
@@ -879,7 +879,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_13-cuda-aarch64-12_6
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
       timeout-minutes: 420
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -925,7 +925,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_13-cuda-aarch64-12_8
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
       timeout-minutes: 420
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -971,7 +971,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_13-cuda-aarch64-12_9
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
       timeout-minutes: 420
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -1017,7 +1017,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_13-cuda-aarch64-13_0
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-bindings==13.0.3; platform_system == 'Linux' | nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
       timeout-minutes: 420
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
```
```diff
@@ -1128,7 +1128,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_13t-cuda-aarch64-12_6
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
       timeout-minutes: 420
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -1174,7 +1174,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_13t-cuda-aarch64-12_8
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
       timeout-minutes: 420
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -1220,7 +1220,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_13t-cuda-aarch64-12_9
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
       timeout-minutes: 420
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -1266,7 +1266,7 @@ jobs:
       ALPINE_IMAGE: "arm64v8/alpine"
       build_name: manywheel-py3_13t-cuda-aarch64-13_0
       build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-bindings==13.0.3; platform_system == 'Linux' | nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
       timeout-minutes: 420
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
```
@ -1377,7 +1377,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14-cuda-aarch64-12_6
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1423,7 +1423,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14-cuda-aarch64-12_8
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1469,7 +1469,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1515,7 +1515,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-bindings==13.0.3; platform_system == 'Linux' | nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1626,7 +1626,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14t-cuda-aarch64-12_6
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1672,7 +1672,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14t-cuda-aarch64-12_8
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1718,7 +1718,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14t-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1764,7 +1764,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14t-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-bindings==13.0.3; platform_system == 'Linux' | nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
56
.github/workflows/generated-linux-binary-manywheel-nightly.yml
generated
vendored
@ -127,7 +127,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_10-cuda12_6
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_10-cuda12_6-test: # Testing
@ -193,7 +193,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_10-cuda12_8
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_10-cuda12_8-test: # Testing
@ -259,7 +259,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_10-cuda12_9
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_10-cuda12_9-test: # Testing
@ -325,7 +325,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_10-cuda13_0
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-bindings==13.0.3; platform_system == 'Linux' | nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_10-cuda13_0-test: # Testing
@ -793,7 +793,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_11-cuda12_6
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_11-cuda12_6-test: # Testing
@ -859,7 +859,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_11-cuda12_8
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_11-cuda12_8-test: # Testing
@ -925,7 +925,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_11-cuda12_9
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_11-cuda12_9-test: # Testing
@ -991,7 +991,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_11-cuda13_0
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-bindings==13.0.3; platform_system == 'Linux' | nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_11-cuda13_0-test: # Testing
@ -1459,7 +1459,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_12-cuda12_6
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_12-cuda12_6-test: # Testing
@ -1525,7 +1525,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_12-cuda12_8
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_12-cuda12_8-test: # Testing
@ -1591,7 +1591,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_12-cuda12_9
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_12-cuda12_9-test: # Testing
@ -1657,7 +1657,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_12-cuda13_0
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-bindings==13.0.3; platform_system == 'Linux' | nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_12-cuda13_0-test: # Testing
@ -2125,7 +2125,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13-cuda12_6
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13-cuda12_6-test: # Testing
@ -2191,7 +2191,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13-cuda12_8
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13-cuda12_8-test: # Testing
@ -2257,7 +2257,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13-cuda12_9
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13-cuda12_9-test: # Testing
@ -2323,7 +2323,7 @@ jobs:
|
|||||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||||
build_name: manywheel-py3_13-cuda13_0
|
build_name: manywheel-py3_13-cuda13_0
|
||||||
build_environment: linux-binary-manywheel
|
build_environment: linux-binary-manywheel
|
||||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
|
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-bindings==13.0.3; platform_system == 'Linux' | nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
|
||||||
secrets:
|
secrets:
|
||||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||||
manywheel-py3_13-cuda13_0-test: # Testing
|
manywheel-py3_13-cuda13_0-test: # Testing
|
||||||
@ -2791,7 +2791,7 @@ jobs:
|
|||||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||||
build_name: manywheel-py3_13t-cuda12_6
|
build_name: manywheel-py3_13t-cuda12_6
|
||||||
build_environment: linux-binary-manywheel
|
build_environment: linux-binary-manywheel
|
||||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||||
secrets:
|
secrets:
|
||||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||||
manywheel-py3_13t-cuda12_6-test: # Testing
|
manywheel-py3_13t-cuda12_6-test: # Testing
|
||||||
@ -2857,7 +2857,7 @@ jobs:
|
|||||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||||
build_name: manywheel-py3_13t-cuda12_8
|
build_name: manywheel-py3_13t-cuda12_8
|
||||||
build_environment: linux-binary-manywheel
|
build_environment: linux-binary-manywheel
|
||||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||||
secrets:
|
secrets:
|
||||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||||
manywheel-py3_13t-cuda12_8-test: # Testing
|
manywheel-py3_13t-cuda12_8-test: # Testing
|
||||||
@ -2923,7 +2923,7 @@ jobs:
|
|||||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||||
build_name: manywheel-py3_13t-cuda12_9
|
build_name: manywheel-py3_13t-cuda12_9
|
||||||
build_environment: linux-binary-manywheel
|
build_environment: linux-binary-manywheel
|
||||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
|
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
|
||||||
secrets:
|
secrets:
|
||||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||||
manywheel-py3_13t-cuda12_9-test: # Testing
|
manywheel-py3_13t-cuda12_9-test: # Testing
|
||||||
@ -2989,7 +2989,7 @@ jobs:
|
|||||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||||
build_name: manywheel-py3_13t-cuda13_0
|
build_name: manywheel-py3_13t-cuda13_0
|
||||||
build_environment: linux-binary-manywheel
|
build_environment: linux-binary-manywheel
|
||||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
|
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-bindings==13.0.3; platform_system == 'Linux' | nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
|
||||||
secrets:
|
secrets:
|
||||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||||
manywheel-py3_13t-cuda13_0-test: # Testing
|
manywheel-py3_13t-cuda13_0-test: # Testing
|
||||||
@ -3457,7 +3457,7 @@ jobs:
|
|||||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||||
build_name: manywheel-py3_14-cuda12_6
|
build_name: manywheel-py3_14-cuda12_6
|
||||||
build_environment: linux-binary-manywheel
|
build_environment: linux-binary-manywheel
|
||||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||||
secrets:
|
secrets:
|
||||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||||
manywheel-py3_14-cuda12_6-test: # Testing
|
manywheel-py3_14-cuda12_6-test: # Testing
|
||||||
@ -3523,7 +3523,7 @@ jobs:
|
|||||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||||
build_name: manywheel-py3_14-cuda12_8
|
build_name: manywheel-py3_14-cuda12_8
|
||||||
build_environment: linux-binary-manywheel
|
build_environment: linux-binary-manywheel
|
||||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||||
secrets:
|
secrets:
|
||||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||||
manywheel-py3_14-cuda12_8-test: # Testing
|
manywheel-py3_14-cuda12_8-test: # Testing
|
||||||
@ -3589,7 +3589,7 @@ jobs:
|
|||||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||||
build_name: manywheel-py3_14-cuda12_9
|
build_name: manywheel-py3_14-cuda12_9
|
||||||
build_environment: linux-binary-manywheel
|
build_environment: linux-binary-manywheel
|
||||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
|
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
|
||||||
secrets:
|
secrets:
|
||||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||||
manywheel-py3_14-cuda12_9-test: # Testing
|
manywheel-py3_14-cuda12_9-test: # Testing
|
||||||
@ -3655,7 +3655,7 @@ jobs:
|
|||||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||||
build_name: manywheel-py3_14-cuda13_0
|
build_name: manywheel-py3_14-cuda13_0
|
||||||
build_environment: linux-binary-manywheel
|
build_environment: linux-binary-manywheel
|
||||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
|
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-bindings==13.0.3; platform_system == 'Linux' | nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
|
||||||
secrets:
|
secrets:
|
||||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||||
manywheel-py3_14-cuda13_0-test: # Testing
|
manywheel-py3_14-cuda13_0-test: # Testing
|
||||||
@ -4123,7 +4123,7 @@ jobs:
|
|||||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||||
build_name: manywheel-py3_14t-cuda12_6
|
build_name: manywheel-py3_14t-cuda12_6
|
||||||
build_environment: linux-binary-manywheel
|
build_environment: linux-binary-manywheel
|
||||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
|
||||||
secrets:
|
secrets:
|
||||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||||
manywheel-py3_14t-cuda12_6-test: # Testing
|
manywheel-py3_14t-cuda12_6-test: # Testing
|
||||||
@ -4189,7 +4189,7 @@ jobs:
|
|||||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||||
build_name: manywheel-py3_14t-cuda12_8
|
build_name: manywheel-py3_14t-cuda12_8
|
||||||
build_environment: linux-binary-manywheel
|
build_environment: linux-binary-manywheel
|
||||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
|
||||||
secrets:
|
secrets:
|
||||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||||
manywheel-py3_14t-cuda12_8-test: # Testing
|
manywheel-py3_14t-cuda12_8-test: # Testing
|
||||||
@ -4255,7 +4255,7 @@ jobs:
|
|||||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||||
build_name: manywheel-py3_14t-cuda12_9
|
build_name: manywheel-py3_14t-cuda12_9
|
||||||
build_environment: linux-binary-manywheel
|
build_environment: linux-binary-manywheel
|
||||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
|
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-bindings==12.9.4; platform_system == 'Linux' | nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
|
||||||
secrets:
|
secrets:
|
||||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||||
manywheel-py3_14t-cuda12_9-test: # Testing
|
manywheel-py3_14t-cuda12_9-test: # Testing
|
||||||
@ -4321,7 +4321,7 @@ jobs:
|
|||||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||||
build_name: manywheel-py3_14t-cuda13_0
|
build_name: manywheel-py3_14t-cuda13_0
|
||||||
build_environment: linux-binary-manywheel
|
build_environment: linux-binary-manywheel
|
||||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
|
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: cuda-bindings==13.0.3; platform_system == 'Linux' | nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
|
||||||
secrets:
|
secrets:
|
||||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||||
manywheel-py3_14t-cuda13_0-test: # Testing
|
manywheel-py3_14t-cuda13_0-test: # Testing
|
||||||
|
|||||||
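The PYTORCH_EXTRA_INSTALL_REQUIREMENTS value above is one string of PEP 508 requirements joined with " | ", each guarded by a platform_system marker. A minimal Python sketch of splitting such a string and evaluating its markers with the third-party packaging library; this is illustrative only, not the code the CI itself runs:

from packaging.requirements import Requirement

# Hypothetical two-entry excerpt of the cu128 string above.
extra = (
    "cuda-bindings==12.9.4; platform_system == 'Linux' | "
    "nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux'"
)

for spec in extra.split(" | "):
    req = Requirement(spec)
    # The marker evaluates against the running interpreter's environment.
    applies = req.marker is None or req.marker.evaluate()
    print(f"{req.name}{req.specifier}", "-> install" if applies else "-> skip")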
10
.github/workflows/inductor-micro-benchmark.yml
vendored
@@ -30,14 +30,14 @@ jobs:
       opt_out_experiments: lf
 
   build:
-    name: cuda12.8-py3.10-gcc9-sm80
+    name: cuda12.8-py3.10-gcc11-sm80
     uses: ./.github/workflows/_linux-build.yml
     needs:
       - get-default-label-prefix
     with:
       runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
-      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
-      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm80
+      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11-inductor-benchmarks
       cuda-arch-list: '8.0'
       test-matrix: |
         { include: [
@@ -46,11 +46,11 @@ jobs:
     secrets: inherit
 
   test:
-    name: cuda12.8-py3.10-gcc9-sm80
+    name: cuda12.8-py3.10-gcc11-sm80
     uses: ./.github/workflows/_linux-test.yml
     needs: build
     with:
-      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm80
       docker-image: ${{ needs.build.outputs.docker-image }}
       test-matrix: ${{ needs.build.outputs.test-matrix }}
       timeout-minutes: 720
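This rename threads gcc11 through three places that must agree: the job name, build-environment, and docker-image-name. A small illustrative check (hypothetical helper, not repo code) that the compiler version embedded in the two strings stays in sync:

import re

def gcc_version(name: str) -> str | None:
    # Hypothetical helper: pull the gccNN token out of a CI name.
    m = re.search(r"gcc(\d+)", name)
    return m.group(1) if m else None

build_env = "linux-jammy-cuda12.8-py3.10-gcc11-sm80"
docker_image = "ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11-inductor-benchmarks"
assert gcc_version(build_env) == gcc_version(docker_image) == "11"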
10
.github/workflows/inductor-perf-compare.yml
vendored
@@ -27,14 +27,14 @@ jobs:
       opt_out_experiments: lf
 
   build:
-    name: cuda12.8-py3.10-gcc9-sm80
+    name: cuda12.8-py3.10-gcc11-sm80
     uses: ./.github/workflows/_linux-build.yml
     needs:
       - get-default-label-prefix
     with:
       runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
-      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
-      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm80
+      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11-inductor-benchmarks
       cuda-arch-list: '8.0'
       test-matrix: |
         { include: [
@@ -47,11 +47,11 @@ jobs:
     secrets: inherit
 
   test:
-    name: cuda12.8-py3.10-gcc9-sm80
+    name: cuda12.8-py3.10-gcc11-sm80
     uses: ./.github/workflows/_linux-test.yml
     needs: build
     with:
-      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm80
       docker-image: ${{ needs.build.outputs.docker-image }}
       test-matrix: ${{ needs.build.outputs.test-matrix }}
       # disable monitor in perf tests for more investigation
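The test-matrix value in these jobs is a YAML literal block whose content is a JSON object with an include list (the "{ include: [" opener shown above). A sketch of loading such a matrix, assuming well-formed JSON with quoted keys; the entry fields here are hypothetical:

import json

test_matrix = '{ "include": [ { "config": "default", "shard": 1, "num_shards": 1 } ] }'
for entry in json.loads(test_matrix)["include"]:
    print(entry["config"], entry["shard"], "of", entry["num_shards"])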
18
.github/workflows/inductor-perf-test-b200.yml
vendored
@@ -80,7 +80,7 @@ jobs:
       opt_out_experiments: lf
 
   build:
-    name: cuda12.8-py3.10-gcc9-sm100
+    name: cuda12.8-py3.10-gcc11-sm100
     uses: ./.github/workflows/_linux-build.yml
     needs: get-label-type
     with:
@@ -90,8 +90,8 @@ jobs:
       # from trunk. Also use a memory-intensive runner here because memory is
       # usually the bottleneck
       runner: linux.12xlarge.memory
-      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100
-      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100
+      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11-inductor-benchmarks
       cuda-arch-list: '10.0'
       test-matrix: |
         { include: [
@@ -104,12 +104,12 @@ jobs:
     secrets: inherit
 
   test-periodically:
-    name: cuda12.8-py3.10-gcc9-sm100
+    name: cuda12.8-py3.10-gcc11-sm100
     uses: ./.github/workflows/_linux-test.yml
     needs: build
     if: github.event.schedule == '0 7 * * 1-6'
     with:
-      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100
       dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-cudagraphs_low_precision-true
       docker-image: ${{ needs.build.outputs.docker-image }}
       test-matrix: ${{ needs.build.outputs.test-matrix }}
@@ -121,12 +121,12 @@ jobs:
     secrets: inherit
 
   test-weekly:
-    name: cuda12.8-py3.10-gcc9-sm100
+    name: cuda12.8-py3.10-gcc11-sm100
     uses: ./.github/workflows/_linux-test.yml
     needs: build
     if: github.event.schedule == '0 7 * * 0'
     with:
-      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100
       dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-maxautotune-true-freeze_autotune_cudagraphs-true-cudagraphs_low_precision-true
       docker-image: ${{ needs.build.outputs.docker-image }}
       test-matrix: ${{ needs.build.outputs.test-matrix }}
@@ -138,11 +138,11 @@ jobs:
     secrets: inherit
 
   test:
-    name: cuda12.8-py3.10-gcc9-sm100
+    name: cuda12.8-py3.10-gcc11-sm100
     uses: ./.github/workflows/_linux-test.yml
     needs: build
     with:
-      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100
       dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cudagraphs-${{ inputs.cudagraphs }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}-maxautotune-${{ inputs.maxautotune }}-freezing_cudagraphs-${{ inputs.freezing_cudagraphs }}-cudagraphs_low_precision-${{ inputs.cudagraphs }}
       docker-image: ${{ needs.build.outputs.docker-image }}
       test-matrix: ${{ needs.build.outputs.test-matrix }}
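test-periodically and test-weekly above are gated on mutually exclusive cron schedules: '0 7 * * 1-6' fires Monday through Saturday and '0 7 * * 0' fires Sunday only, so exactly one of them runs per scheduled day. A quick illustrative check that the two day-of-week sets partition the week:

# cron day-of-week field: 0 = Sunday, 1-6 = Monday..Saturday.
periodic_days = set(range(1, 7))  # '0 7 * * 1-6'
weekly_days = {0}                 # '0 7 * * 0'
assert periodic_days.isdisjoint(weekly_days)
assert periodic_days | weekly_days == set(range(7))  # every day covered exactly once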
@@ -95,8 +95,8 @@ jobs:
       # from trunk. Also use a memory-intensive runner here because memory is
       # usually the bottleneck
       runner: linux.12xlarge.memory
-      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm90
-      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm90
+      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11-inductor-benchmarks
       cuda-arch-list: '9.0'
       test-matrix: |
         { include: [
@@ -132,7 +132,7 @@ jobs:
     needs: build
     if: github.event.schedule == '15 0 * * 1-6'
     with:
-      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm90
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm90
       dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-cudagraphs_low_precision-true
       docker-image: ${{ needs.build.outputs.docker-image }}
       test-matrix: ${{ needs.build.outputs.test-matrix }}
@@ -149,7 +149,7 @@ jobs:
     needs: build
     if: github.event.schedule == '0 7 * * 0'
     with:
-      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm90
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm90
       dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-maxautotune-true-freeze_autotune_cudagraphs-true-cudagraphs_low_precision-true
       docker-image: ${{ needs.build.outputs.docker-image }}
       test-matrix: ${{ needs.build.outputs.test-matrix }}
@@ -168,7 +168,7 @@ jobs:
     # needs one round of benchmark
     if: ${{ github.event_name == 'workflow_dispatch' || github.event_name == 'pull_request' }}
     with:
-      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm90
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm90
       dashboard-tag: training-${{ inputs.training || 'true' }}-inference-${{ inputs.inference || 'true' }}-default-${{ inputs.default || 'true' }}-dynamic-${{ inputs.dynamic || 'true' }}-cudagraphs-${{ inputs.cudagraphs || 'true' }}-cppwrapper-${{ inputs.cppwrapper || 'false' }}-aotinductor-${{ inputs.aotinductor || 'false' }}-maxautotune-${{ inputs.maxautotune || 'false' }}-freezing_cudagraphs-${{ inputs.freezing_cudagraphs || 'false' }}-cudagraphs_low_precision-${{ inputs.cudagraphs || 'false' }}
       docker-image: ${{ needs.build.outputs.docker-image }}
       test-matrix: ${{ needs.build.outputs.test-matrix }}
18
.github/workflows/inductor-perf-test-nightly.yml
vendored
@@ -80,15 +80,15 @@ jobs:
       opt_out_experiments: lf
 
   build:
-    name: cuda12.8-py3.10-gcc9-sm80
+    name: cuda12.8-py3.10-gcc11-sm80
     uses: ./.github/workflows/_linux-build.yml
     needs: get-label-type
     with:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       # Every bit to make perf run faster helps
       runner: linux.12xlarge.memory
-      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
-      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm80
+      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11-inductor-benchmarks
       cuda-arch-list: '8.0'
       test-matrix: |
         { include: [
@@ -117,12 +117,12 @@ jobs:
     secrets: inherit
 
   test-nightly:
-    name: cuda12.8-py3.10-gcc9-sm80
+    name: cuda12.8-py3.10-gcc11-sm80
     uses: ./.github/workflows/_linux-test.yml
     needs: build
     if: github.event.schedule == '0 7 * * 1-6'
     with:
-      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm80
       dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-cudagraphs_low_precision-true
       docker-image: ${{ needs.build.outputs.docker-image }}
       test-matrix: ${{ needs.build.outputs.test-matrix }}
@@ -133,12 +133,12 @@ jobs:
     secrets: inherit
 
   test-weekly:
-    name: cuda12.8-py3.10-gcc9-sm80
+    name: cuda12.8-py3.10-gcc11-sm80
     uses: ./.github/workflows/_linux-test.yml
     needs: build
     if: github.event.schedule == '0 7 * * 0'
     with:
-      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm80
       dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-maxautotune-true-freeze_autotune_cudagraphs-true-cudagraphs_low_precision-true
       docker-image: ${{ needs.build.outputs.docker-image }}
       test-matrix: ${{ needs.build.outputs.test-matrix }}
@@ -150,12 +150,12 @@ jobs:
     secrets: inherit
 
   test:
-    name: cuda12.8-py3.10-gcc9-sm80
+    name: cuda12.8-py3.10-gcc11-sm80
     uses: ./.github/workflows/_linux-test.yml
     needs: build
     if: github.event_name == 'workflow_dispatch'
     with:
-      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm80
       dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cudagraphs-${{ inputs.cudagraphs }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}-maxautotune-${{ inputs.maxautotune }}-freezing_cudagraphs-${{ inputs.freezing_cudagraphs }}-cudagraphs_low_precision-${{ inputs.cudagraphs }}
       docker-image: ${{ needs.build.outputs.docker-image }}
       test-matrix: ${{ needs.build.outputs.test-matrix }}
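The dashboard-tag strings above flatten key-value pairs into one hyphen-joined token stream (training-true-inference-true-...). A sketch of recovering the pairs, assuming keys never contain hyphens (multi-word keys use underscores, as in freezing_cudagraphs); the short tag here is a hypothetical excerpt:

tag = "training-true-inference-true-cudagraphs_low_precision-true"
parts = tag.split("-")
# Pair up alternating key/value tokens.
settings = dict(zip(parts[::2], parts[1::2]))
print(settings)  # {'training': 'true', 'inference': 'true', 'cudagraphs_low_precision': 'true'}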
12
.github/workflows/inductor-periodic.yml
vendored
@@ -37,8 +37,8 @@ jobs:
     needs: get-default-label-prefix
     with:
       runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
-      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86
-      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm86
+      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11-inductor-benchmarks
       cuda-arch-list: '8.0;8.6'
       test-matrix: |
         { include: [
@@ -76,7 +76,7 @@ jobs:
     uses: ./.github/workflows/_linux-test.yml
     needs: periodic-dynamo-benchmarks-build
     with:
-      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm86
       docker-image: ${{ needs.periodic-dynamo-benchmarks-build.outputs.docker-image }}
       test-matrix: ${{ needs.periodic-dynamo-benchmarks-build.outputs.test-matrix }}
     secrets: inherit
@@ -138,8 +138,8 @@ jobs:
       - get-default-label-prefix
     with:
       runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
-      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
-      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm80
+      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11-inductor-benchmarks
       cuda-arch-list: '8.0'
       test-matrix: |
         { include: [
@@ -153,7 +153,7 @@ jobs:
     uses: ./.github/workflows/_linux-test.yml
     needs: inductor-smoke-build
     with:
-      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm80
       docker-image: ${{ needs.inductor-smoke-build.outputs.docker-image }}
       test-matrix: ${{ needs.inductor-smoke-build.outputs.test-matrix }}
     secrets: inherit
6
.github/workflows/inductor-unittest.yml
vendored
@@ -33,8 +33,8 @@ jobs:
     uses: ./.github/workflows/_linux-build.yml
     needs: get-label-type
     with:
-      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86
-      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm86
+      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11-inductor-benchmarks
       cuda-arch-list: '8.6'
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       test-matrix: |
@@ -52,7 +52,7 @@ jobs:
     uses: ./.github/workflows/_linux-test.yml
     needs: inductor-build
     with:
-      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm86
       docker-image: ${{ needs.inductor-build.outputs.docker-image }}
       test-matrix: ${{ needs.inductor-build.outputs.test-matrix }}
     secrets: inherit
6
.github/workflows/inductor.yml
vendored
@@ -49,8 +49,8 @@ jobs:
     uses: ./.github/workflows/_linux-build.yml
     needs: get-label-type
     with:
-      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86
-      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm86
+      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11-inductor-benchmarks
       cuda-arch-list: '8.6'
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       test-matrix: |
@@ -69,7 +69,7 @@ jobs:
     uses: ./.github/workflows/_linux-test.yml
     needs: inductor-build
     with:
-      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm86
       docker-image: ${{ needs.inductor-build.outputs.docker-image }}
       test-matrix: ${{ needs.inductor-build.outputs.test-matrix }}
     secrets: inherit
@ -25,7 +25,7 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
with:
runner: linux.12xlarge.memory
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm80
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
cuda-arch-list: '8.0 9.0'
test-matrix: |
@ -41,7 +41,7 @@ jobs:
needs: opmicrobenchmark-build
with:
timeout-minutes: 500
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm80
docker-image: ${{ needs.opmicrobenchmark-build.outputs.docker-image }}
test-matrix: ${{ needs.opmicrobenchmark-build.outputs.test-matrix }}
secrets: inherit
@ -53,7 +53,7 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
with:
runner: linux.12xlarge.memory
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
cuda-arch-list: '10.0'
test-matrix: |
@ -68,7 +68,7 @@ jobs:
needs: opmicrobenchmark-build-b200
with:
timeout-minutes: 500
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100
docker-image: ${{ needs.opmicrobenchmark-build-b200.outputs.docker-image }}
test-matrix: ${{ needs.opmicrobenchmark-build-b200.outputs.test-matrix }}
aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
51
.github/workflows/periodic.yml
vendored
@ -90,6 +90,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-cuda12.8-py3.10-gcc11
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
cuda-arch-list: 8.6
test-matrix: |
{ include: [
{ config: "nogpu_AVX512", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
@ -97,7 +98,9 @@ jobs:
{ config: "nogpu_AVX512", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
{ config: "nogpu_NO_AVX2", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
{ config: "nogpu_NO_AVX2", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
{ config: "jit_legacy", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
{ config: "jit_legacy", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
{ config: "multigpu", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.12xlarge.nvidia.gpu", owners: ["oncall:distributed"] },
{ config: "multigpu", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.12xlarge.nvidia.gpu", owners: ["oncall:distributed"] },
]}
secrets: inherit

@ -113,40 +116,14 @@ jobs:
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build.outputs.test-matrix }}
secrets: inherit

linux-jammy-cuda12_8-py3_10-gcc9-build:
linux-jammy-cuda12_8-py3_10-gcc11-debug-build:
name: linux-jammy-cuda12.8-py3.10-gcc9
name: linux-jammy-cuda12.8-py3.10-gcc11-debug
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-cuda12.8-py3.10-gcc9
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-debug
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
cuda-arch-list: 8.6
test-matrix: |
{ include: [
{ config: "multigpu", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.12xlarge.nvidia.gpu", owners: ["oncall:distributed"] },
{ config: "multigpu", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.12xlarge.nvidia.gpu", owners: ["oncall:distributed"] },
]}
secrets: inherit

linux-jammy-cuda12_8-py3_10-gcc9-test:
name: linux-jammy-cuda12.8-py3.10-gcc9
uses: ./.github/workflows/_linux-test.yml
needs: linux-jammy-cuda12_8-py3_10-gcc9-build
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc9
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-build.outputs.test-matrix }}
secrets: inherit

linux-jammy-cuda12_8-py3_10-gcc9-debug-build:
name: linux-jammy-cuda12.8-py3.10-gcc9-debug
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-debug
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9
cuda-arch-list: 8.9
test-matrix: |
{ include: [
@ -160,16 +137,16 @@ jobs:
]}
secrets: inherit

linux-jammy-cuda12_8-py3_10-gcc9-debug-test:
linux-jammy-cuda12_8-py3_10-gcc11-debug-test:
name: linux-jammy-cuda12.8-py3.10-gcc9-debug
name: linux-jammy-cuda12.8-py3.10-gcc11-debug
uses: ./.github/workflows/_linux-test.yml
needs:
- linux-jammy-cuda12_8-py3_10-gcc9-debug-build
- linux-jammy-cuda12_8-py3_10-gcc11-debug-build
- target-determination
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-debug
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-debug
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-debug-build.outputs.docker-image }}
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-debug-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-debug-build.outputs.test-matrix }}
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-debug-build.outputs.test-matrix }}
secrets: inherit

linux-jammy-cuda13_0-py3_10-gcc11-build:
20
.github/workflows/pull.yml
vendored
@ -318,14 +318,14 @@ jobs:
]}
secrets: inherit

linux-jammy-cuda12_8-py3_10-gcc9-inductor-build:
linux-jammy-cuda12_8-py3_10-gcc11-inductor-build:
name: cuda12.8-py3.10-gcc9-sm75
name: cuda12.8-py3.10-gcc11-sm75
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm75
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm75
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11-inductor-benchmarks
cuda-arch-list: '7.5'
test-matrix: |
{ include: [
@ -333,14 +333,14 @@ jobs:
]}
secrets: inherit

linux-jammy-cuda12_8-py3_10-gcc9-inductor-test:
linux-jammy-cuda12_8-py3_10-gcc11-inductor-test:
name: cuda12.8-py3.10-gcc9-sm75
name: cuda12.8-py3.10-gcc11-sm75
uses: ./.github/workflows/_linux-test.yml
needs: linux-jammy-cuda12_8-py3_10-gcc9-inductor-build
needs: linux-jammy-cuda12_8-py3_10-gcc11-inductor-build
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm75
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm75
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.docker-image }}
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.test-matrix }}
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-inductor-build.outputs.test-matrix }}
secrets: inherit

linux-noble-xpu-n-py3_10-build:
10
.github/workflows/torchbench.yml
vendored
@ -26,14 +26,14 @@ jobs:
curr_ref_type: ${{ github.ref_type }}

build:
name: cuda12.8-py3.10-gcc9-sm80
name: cuda12.8-py3.10-gcc11-sm80
uses: ./.github/workflows/_linux-build.yml
needs:
- get-default-label-prefix
with:
runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm80
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11-inductor-benchmarks
cuda-arch-list: '8.0'
test-matrix: |
{ include: [
@ -42,11 +42,11 @@ jobs:
secrets: inherit

test:
name: cuda12.8-py3.10-gcc9-sm80
name: cuda12.8-py3.10-gcc11-sm80
uses: ./.github/workflows/_linux-test.yml
needs: build
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm80
docker-image: ${{ needs.build.outputs.docker-image }}
test-matrix: ${{ needs.build.outputs.test-matrix }}
secrets: inherit
4
.github/workflows/trunk.yml
vendored
@ -231,8 +231,8 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
build-environment: linux-jammy-cuda12.8-py3.12-gcc9-sm80
build-environment: linux-jammy-cuda12.8-py3.12-gcc11-sm80
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11-inductor-benchmarks
cuda-arch-list: '8.0'
secrets: inherit
@ -10,6 +10,7 @@
- Do NOT run pre-commit, it is not setup
- To run lint, run 'lintrunner -a' (which will autoapply changes)
- Do NOT attempt to install dependencies, you do not have Internet access
- Do NOT create summary files unless explicitly asked
- When you are ready to make a PR, do exactly these steps:
- git stash -u
- git reset --hard $(cat /tmp/orig_work.txt) # NB: reset to the LOCAL branch, do NOT fetch
@ -680,7 +680,7 @@ TORCH_API bool elementTypeCanBeInferredFromMembers(const TypePtr& elem_type) {
return false;
}
if (elem_type->kind() == AnyType::Kind) {
// List of Any can contains heterogenous types
// List of Any can contains heterogeneous types
return false;
}
return true;
@ -238,11 +238,18 @@ private:
}

void moveHelper(CUDAEvent&& other) {
std::swap(flags_, other.flags_);
std::swap(is_created_, other.is_created_);
std::swap(was_recorded_, other.was_recorded_);
std::swap(device_index_, other.device_index_);
std::swap(event_, other.event_);
// Transfer ownership of all state from other to this
flags_ = other.flags_;
is_created_ = other.is_created_;
was_recorded_ = other.was_recorded_;
external_ = other.external_;
device_index_ = other.device_index_;
event_ = other.event_;

// Reset other to a valid empty state to prevent double-free
// The moved-from object must not attempt to destroy the event
other.is_created_ = false;
other.event_ = cudaEvent_t{};
}
};
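The rewritten moveHelper above transfers every field (including an external_ flag the swap-based version did not touch) and then explicitly resets the source, so only one object ever destroys the event. A minimal standalone sketch of that transfer-and-reset shape — EventHandle stands in for cudaEvent_t, and this is an illustration, not the PyTorch class:

#include <utility>

using EventHandle = void*;  // stand-in for cudaEvent_t in this sketch

class Event {
 public:
  Event() = default;
  Event(Event&& other) noexcept { moveHelper(std::move(other)); }
  Event& operator=(Event&& other) noexcept {
    if (this != &other) {
      // A full implementation would first release any event *this owns.
      moveHelper(std::move(other));
    }
    return *this;
  }
  ~Event() {
    // if (is_created_) destroyEvent(event_);  // only the owner destroys
  }

 private:
  void moveHelper(Event&& other) {
    // Transfer all state, then reset the source so its destructor is a no-op.
    is_created_ = other.is_created_;
    event_ = other.event_;
    other.is_created_ = false;
    other.event_ = EventHandle{};
  }

  bool is_created_ = false;
  EventHandle event_ = EventHandle{};
};

int main() {
  Event a;
  Event b(std::move(a));  // 'a' is left empty; no double-destroy at scope exit
}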
@ -813,8 +813,43 @@ void smooth_l1_kernel(TensorIteratorBase& iter, double beta) {
}

void huber_kernel(TensorIterator& iter, double delta) {
AT_DISPATCH_FLOATING_TYPES_AND2(
kBFloat16, kHalf, iter.dtype(), "huber_cpu", [&]() {
// Special-case kHalf: compute in float for numerical stability
if (iter.dtype() == kHalf) {
const float delta_val(static_cast<float>(delta));
const Vectorized<float> delta_vec(static_cast<float>(delta));
const Vectorized<float> point_five_vec(static_cast<float>(0.5));
cpu_kernel_vec(
iter,
// scalar lambda: convert half -> float, compute in float, cast back to half
[&delta_val] (at::Half a, at::Half b) -> at::Half {
float af = static_cast<float>(a);
float bf = static_cast<float>(b);
float z = std::abs(af - bf);
float out = z < delta_val
? 0.5f * z * z
: delta_val * (z - 0.5f * delta_val);
return static_cast<at::Half>(out);
},
[&delta_vec, &point_five_vec] (Vectorized<Half> a, Vectorized<Half> b) {
auto [a0, a1] = convert_half_float(a);
auto [b0, b1] = convert_half_float(b);
auto z = (a0 - b0).abs();
a0 = Vectorized<float>::blendv(
point_five_vec * z * z,
delta_vec * (z - point_five_vec * delta_vec),
z >= delta_vec);
z = (a1 - b1).abs();
a1 = Vectorized<float>::blendv(
point_five_vec * z * z,
delta_vec * (z - point_five_vec * delta_vec),
z >= delta_vec);
return convert_float_half(a0, a1);
}
);
return;
}
else {
AT_DISPATCH_FLOATING_TYPES_AND(kBFloat16, iter.dtype(), "huber_cpu", [&]() {
using Vec = Vectorized<scalar_t>;
const scalar_t delta_val(delta);
const Vec delta_val_vec(delta_val);
@ -835,6 +870,7 @@ void huber_kernel(TensorIterator& iter, double delta) {
z >= delta_val_vec);
});
});
}
}

void sigmoid_backward_kernel(TensorIteratorBase& iter) {
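In the new kHalf branch above, both the scalar and the vector lambdas widen half inputs to float, evaluate the Huber formula, and narrow only the final result. The scalar math restated as a small self-contained C++ reference (illustrative; at::Half is replaced by float at the boundary):

#include <cmath>
#include <cstdio>

// Huber with the arithmetic carried out in float, as in the kHalf path above.
// Computing 0.5*z*z directly in half precision can overflow (half tops out
// near 65504) and loses precision for small z; float avoids both problems.
float huber_ref(float a, float b, float delta) {
  float z = std::fabs(a - b);
  return z < delta ? 0.5f * z * z : delta * (z - 0.5f * delta);
}

int main() {
  // |300 - 0| = 300, so 0.5*z*z = 45000: still representable in half, but
  // close to its ceiling, while float has headroom to spare.
  std::printf("%f\n", huber_ref(300.0f, 0.0f, 1.0f));
}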
@ -147,6 +147,19 @@ class MetalShaderLibrary {
const std::optional<c10::Scalar> alpha = std::nullopt,
const std::optional<c10::ScalarType> scalar_arg_type = std::nullopt);

template <typename T>
void exec_unary_kernel_with_params(
TensorIteratorBase& iter,
const std::string& name,
T params,
const std::string& params_type_name);
template <typename T>
void exec_binary_kernel_with_params(
TensorIteratorBase& iter,
const std::string& name,
T params,
const std::string& params_type_name);

protected:
virtual MTLLibrary_t getLibrary();
virtual MTLLibrary_t getLibrary(
@ -7,10 +7,12 @@
#include <ATen/Tensor.h>
#include <ATen/TensorIterator.h>
#include <ATen/Utils.h>
#include <ATen/mps/MPSProfiler.h>
#include <ATen/mps/MPSStream.h>
#include <ATen/native/mps/MetalShaderLibrary.h>
#include <ATen/native/mps/TensorFactory.h>
#include <c10/core/ScalarType.h>
#include <fmt/format.h>
#include <torch/library.h>
#include <unordered_map>

@ -630,4 +632,147 @@ inline bool needsGather(const TensorBase& t) {
return !is_macOS_15_0_or_newer && (!t.is_contiguous() || t.storage_offset());
}

template <typename T>
void MetalShaderLibrary::exec_unary_kernel_with_params(TensorIteratorBase& iter,
const std::string& name,
T params,
const std::string& params_type_name) {
using namespace at::mps;
// Decompose 64-bit tensor into 32-bit ones
if (!iter.can_use_32bit_indexing()) {
for (auto&& sub_iter : iter.with_32bit_indexing()) {
exec_unary_kernel_with_params(sub_iter, name, params, params_type_name);
}
return;
}

auto inputTensor = iter.input(0);
auto outputTensor = iter.output(0);
uint32_t length = iter.numel();
if (length == 0) {
return;
}
auto kernel_name = fmt::format("{}_{}_{}_{}{}",
name,
iter.is_contiguous() ? "dense" : "strided",
scalarToMetalTypeString(outputTensor),
scalarToMetalTypeString(inputTensor),
fmt::format("_{}", params_type_name));
@autoreleasepool {
auto cplState = getPipelineStateForFunc(kernel_name);

MPSStream* mpsStream = getCurrentMPSStream();
dispatch_sync(mpsStream->queue(), ^() {
auto computeEncoder = mpsStream->commandEncoder();

getMPSProfiler().beginProfileKernel(cplState, name, {inputTensor});

[computeEncoder setComputePipelineState:cplState];
bind_iter_tensors(computeEncoder, iter);
if (!iter.is_contiguous()) {
mtl_setArgs<2>(computeEncoder,
outputTensor.sizes(),
inputTensor.strides(),
outputTensor.strides(),
inputTensor.ndimension());
}
detail::mtl_setArg(computeEncoder, params, iter.is_contiguous() ? 2 : 6);
mtl_dispatch1DJob(computeEncoder, cplState, length);

getMPSProfiler().endProfileKernel(cplState);
});
}
}

template <typename T>
void MetalShaderLibrary::exec_binary_kernel_with_params(TensorIteratorBase& iter,
const std::string& name,
T params,
const std::string& params_type_name) {
using namespace mps;
// TODO: Figure a better place to downcast double scalars (probably in tensor iterator itself?)
// Right now running something like 1.0-torch.rand(5, device='mps') will create iterator with
// double as common dtype (because Python floating point are always 64-bit values)
TORCH_CHECK(iter.output().scalar_type() != at::kDouble, "float64 is not supported on MPS");

// Skip for empty iterators
if (iter.numel() == 0) {
return;
}

// Decompose 64-bit tensor into 32-bit ones
if (!iter.can_use_32bit_indexing()) {
for (auto&& sub_iter : iter.with_32bit_indexing()) {
exec_binary_kernel_with_params(sub_iter, name, params, params_type_name);
}
return;
}

auto convert_double_scalar = [](Tensor& t) {
if (t.dim() != 0) {
return;
}
if (t.scalar_type() == kDouble) {
t = t.to(kFloat);
} else if (t.scalar_type() == kComplexDouble) {
t = t.to(kComplexFloat);
}
};

Tensor input = iter.input(0);
Tensor other = iter.input(1);
Tensor out = iter.output();

convert_double_scalar(input);
convert_double_scalar(other);

MPSStream* mpsStream = getCurrentMPSStream();
const auto cast_needed = input.scalar_type() != other.scalar_type();
const auto suffix = iter.is_contiguous() ? "dense" : "strided";
// TODO: Implicitly pass both input and output types to non-cast kernels
const auto kernel_name = cast_needed
? fmt::format("{}_{}_cast_{}_{}", name, suffix, scalarToMetalTypeString(out), params_type_name)
: fmt::format("{}_{}_{}_{}_{}",
name,
suffix,
scalarToMetalTypeString(out),
scalarToMetalTypeString(input),
params_type_name);
dispatch_sync_with_rethrow(mpsStream->queue(), ^() {
@autoreleasepool {
auto computeEncoder = mpsStream->commandEncoder();
auto binaryPSO = getPipelineStateForFunc(kernel_name);
// this function call is a no-op if MPS Profiler is not enabled
getMPSProfiler().beginProfileKernel(binaryPSO, kernel_name, {input, other});
[computeEncoder setComputePipelineState:binaryPSO];
// Set input and output tensors
bind_iter_tensors(computeEncoder, iter);
// Iterator is contiguous if all of its elements are dense in storage,
// i.e. it's true for both row-first and column-first tensors
if (iter.is_contiguous()) {
detail::mtl_setArg(computeEncoder, params, 3);
if (cast_needed) {
std::array<int, 4> size_and_types = {static_cast<int>(c10::elementSize(input.scalar_type())),
static_cast<int>(c10::elementSize(other.scalar_type())),
static_cast<int>(input.scalar_type()),
static_cast<int>(other.scalar_type())};
mtl_setBytes(computeEncoder, size_and_types, 4);
}
} else {
// Please note that shapes and strides of the iterator might be
// different than that of its operands, for example binary op
// between 4x4 tensor and scalar will result in 1D 16 element iterator
std::array<int, 4> ndim_and_types = {iter.ndim(),
static_cast<int>(input.scalar_type()),
static_cast<int>(other.scalar_type()),
static_cast<int>(out.scalar_type())};
mtl_setArgs<3>(
computeEncoder, params, iter.shape(), iter.strides(0), iter.strides(1), iter.strides(2), ndim_and_types);
}
mtl_dispatch1DJob(computeEncoder, binaryPSO, iter.numel());
getMPSProfiler().endProfileKernel(binaryPSO);
}
});
}

} // namespace at::native::mps
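Both templates above begin by decomposing any iterator that cannot be addressed with 32-bit indices and recursing on the sub-iterators. A toy standalone sketch of that recursion shape (the real TensorIterator split is much richer; this only shows the halve-and-recurse idea):

#include <cstdint>
#include <iostream>

// Halve the range until each piece fits a 32-bit index, then "launch" on it.
void launch_chunked(int64_t begin, int64_t end) {
  if (end - begin > INT32_MAX) {
    int64_t mid = begin + (end - begin) / 2;
    launch_chunked(begin, mid);
    launch_chunked(mid, end);
    return;
  }
  std::cout << "kernel over [" << begin << ", " << end << ")\n";
}

int main() {
  launch_chunked(0, 5000000000LL);  // splits into 32-bit-addressable pieces
}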
16
aten/src/ATen/native/mps/kernels/Activation.h
Normal file
@ -0,0 +1,16 @@
#pragma once

template <typename T>
struct ELUParams {
T alpha;
T scale;
T input_scale;
};

template <typename T>
struct ELUBackwardParams {
T alpha;
T scale;
T input_scale;
bool is_result;
};
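These structs are compiled both as host C++ and as Metal shader code, so the two declarations must stay in lockstep: the bytes are copied to the GPU verbatim. A host-side sanity-check sketch (the static_asserts are an assumption for illustration, not part of the patch):

#include <type_traits>

template <typename T>
struct ELUParams {
  T alpha;
  T scale;
  T input_scale;
};

// Params are handed to the kernel as raw bytes, so they must remain trivially
// copyable and free of hidden members (vtables, owning pointers, ...).
static_assert(std::is_trivially_copyable_v<ELUParams<float>>);
static_assert(sizeof(ELUParams<float>) == 3 * sizeof(float));

int main() {}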
@ -1,3 +1,4 @@
#include <ATen/native/mps/kernels/Activation.h>
#include <c10/metal/indexing.h>
#include <c10/metal/special_math.h>
#include <metal_stdlib>
@ -99,6 +100,59 @@ REGISTER_BINARY_OP(hardswish_backward, float, float);
REGISTER_BINARY_OP(hardswish_backward, half, half);
REGISTER_BINARY_OP(hardswish_backward, bfloat, bfloat);

struct elu_functor {
template <typename T>
inline T operator()(const T self_, const ELUParams<T> params) {
using op_T = opmath_t<T>;
auto alpha = static_cast<op_T>(params.alpha);
auto scale = static_cast<op_T>(params.scale);
auto input_scale = static_cast<op_T>(params.input_scale);
auto self = static_cast<op_T>(self_);
auto neg_res = alpha * (::metal::precise::exp(self * input_scale) - 1);
return static_cast<T>(scale * (self < 0 ? neg_res : self));
}
};

struct elu_backward_functor {
template <typename T>
inline T operator()(
const T grad_output_,
const T self_,
ELUBackwardParams<T> params) {
using op_T = opmath_t<T>;
auto alpha = static_cast<op_T>(params.alpha);
auto scale = static_cast<op_T>(params.scale);
auto input_scale = static_cast<op_T>(params.input_scale);
auto grad_output = static_cast<op_T>(grad_output_);
auto self = static_cast<op_T>(self_);

if (params.is_result) {
auto neg_coef = input_scale * (self + alpha * scale);
return static_cast<T>(grad_output * (self <= 0 ? neg_coef : scale));
} else {
auto neg_coef = input_scale * alpha * scale *
::metal::precise::exp(self * input_scale);
return static_cast<T>(grad_output * (self <= 0 ? neg_coef : scale));
}
}
};

#define REGISTER_ELU_OP(T) \
typedef ELUParams<T> ELUParams_##T; \
REGISTER_UNARY_ALPHA_OP(elu, T, ELUParams_##T, T);

REGISTER_ELU_OP(float);
REGISTER_ELU_OP(half);
REGISTER_ELU_OP(bfloat);

#define REGISTER_ELU_BACKWARD_OP(T) \
typedef ELUBackwardParams<T> ELUBackwardParams_##T; \
REGISTER_BINARY_ALPHA_OP(elu_backward, T, ELUBackwardParams_##T, T);

REGISTER_ELU_BACKWARD_OP(float);
REGISTER_ELU_BACKWARD_OP(half);
REGISTER_ELU_BACKWARD_OP(bfloat);

struct leaky_relu_functor {
template <typename T>
inline T operator()(const T x, const T negative_slope) {
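The two functors above evaluate, in opmath precision, the standard ELU forward and input-gradient formulas. For reference, the same math restated as plain C++ (a host-side sketch, not the Metal code):

#include <cmath>
#include <cstdio>

// elu(x) = scale * ( x                                   if x > 0
//                    alpha * (exp(input_scale * x) - 1)  otherwise )
float elu_ref(float x, float alpha, float scale, float input_scale) {
  float neg_res = alpha * (std::exp(x * input_scale) - 1.0f);
  return scale * (x < 0.0f ? neg_res : x);
}

// Gradient w.r.t. the input (the is_result == false branch above).
float elu_backward_ref(float grad_out, float x,
                       float alpha, float scale, float input_scale) {
  float neg_coef = input_scale * alpha * scale * std::exp(x * input_scale);
  return grad_out * (x <= 0.0f ? neg_coef : scale);
}

int main() {
  std::printf("elu(-1)  = %f\n", elu_ref(-1.0f, 1.0f, 1.0f, 1.0f));  // ~-0.6321
  std::printf("delu(-1) = %f\n", elu_backward_ref(1.0f, -1.0f, 1.0f, 1.0f, 1.0f));
}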
@ -11,8 +11,6 @@
#include <ATen/ops/_log_softmax_native.h>
#include <ATen/ops/_prelu_kernel_backward_native.h>
#include <ATen/ops/_prelu_kernel_native.h>
#include <ATen/ops/elu_backward_native.h>
#include <ATen/ops/elu_native.h>
#include <ATen/ops/gelu_backward_native.h>
#include <ATen/ops/gelu_native.h>
#include <ATen/ops/glu_backward_native.h>
@ -119,6 +117,10 @@ Tensor& relu_mps_(Tensor& self) {

TORCH_IMPL_FUNC(log_softmax_mps_out)
(const Tensor& self, const int64_t dim, const bool half_to_float, const Tensor& out) {
TORCH_CHECK_NOT_IMPLEMENTED(self.scalar_type() != kLong, "MPS doesn't know how to do exponent_i64");
TORCH_CHECK_NOT_IMPLEMENTED(!c10::isComplexType(self.scalar_type()),
"log_softmax for complex is not supported for MPS");
TORCH_CHECK_NOT_IMPLEMENTED(self.scalar_type() != kBool, "log_softmax for bool is not supported for MPS");
using namespace mps;
using CachedGraph = MPSUnaryCachedGraph;

@ -162,6 +164,10 @@ TORCH_IMPL_FUNC(log_softmax_mps_out)

TORCH_IMPL_FUNC(log_softmax_backward_mps_out)
(const Tensor& grad_output, const Tensor& output, int64_t dim, ScalarType input_dtype, const Tensor& out) {
TORCH_CHECK_NOT_IMPLEMENTED(grad_output.scalar_type() != kLong, "MPS doesn't know how to do exponent_i64");
TORCH_CHECK_NOT_IMPLEMENTED(!c10::isComplexType(grad_output.scalar_type()),
"log_softmax for complex is not supported for MPS");
TORCH_CHECK_NOT_IMPLEMENTED(grad_output.scalar_type() != kBool, "log_softmax for bool is not supported for MPS");
using namespace mps;
using CachedGraph = MPSUnaryGradCachedGraph;

@ -202,6 +208,7 @@ TORCH_IMPL_FUNC(log_softmax_backward_mps_out)
}

std::tuple<Tensor&, Tensor&> log_sigmoid_forward_out_mps(const Tensor& self, Tensor& output, Tensor& buffer) {
TORCH_CHECK_NOT_IMPLEMENTED(self.scalar_type() != kLong, "MPS doesn't know how to do exponent_i64");
// NOTE: buffer is only used by CPU dispatch, we just ignore it here
using namespace mps;
using CachedGraph = MPSUnaryCachedGraph;
@ -698,194 +705,6 @@ TORCH_IMPL_FUNC(gelu_backward_out_mps)
}
}

static void elu_variants_out_mps(const Tensor& self,
const Scalar& alpha,
const Scalar& scale,
const Scalar& input_scale,
const Tensor& result,
std::string func_name) {
using namespace mps;
using CachedGraph = MPSUnaryCachedGraph;

auto resultMemFormat = result.suggest_memory_format();
bool executeGatherOp = !(self.is_contiguous(resultMemFormat) && result.is_contiguous(resultMemFormat));
Tensor out;
if (executeGatherOp) {
out = at::empty_like(result, MemoryFormat::Contiguous);
}

// Empty output
if (result.numel() == 0) {
return;
}

MPSStream* stream = getCurrentMPSStream();

@autoreleasepool {
std::string key = func_name + ":" + getTensorsStringKey({self}) + ":" + std::to_string(alpha.to<double>()) + ":" +
std::to_string(scale.to<double>()) + ":" + std::to_string(input_scale.to<double>());

auto cachedGraph = LookUpOrCreateCachedGraph<CachedGraph>(key, [&](auto mpsGraph, auto newCachedGraph) {
MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, self);

// scale * (max(0, x) + min(0, alpha * (exp(input_scale * x) - 1) ))

MPSGraphTensor* alphaTensor = [mpsGraph constantWithScalar:alpha.to<double>()
shape:@[ @1 ]
dataType:getMPSDataType(self)];

MPSGraphTensor* inputScaleTensor = [mpsGraph constantWithScalar:input_scale.to<double>()
shape:@[ @1 ]
dataType:getMPSDataType(self)];

MPSGraphTensor* scaleTensor = [mpsGraph constantWithScalar:scale.to<double>()
shape:@[ @1 ]
dataType:getMPSDataType(self)];
MPSGraphTensor* unitTensor = [mpsGraph constantWithScalar:1.0f shape:@[ @1 ] dataType:getMPSDataType(self)];
MPSGraphTensor* zeroTensor = [mpsGraph constantWithScalar:0.0f shape:@[ @1 ] dataType:getMPSDataType(self)];

MPSGraphTensor* scaledInputTensor = [mpsGraph multiplicationWithPrimaryTensor:inputTensor
secondaryTensor:inputScaleTensor
name:nil];
MPSGraphTensor* exponentTensor = [mpsGraph exponentWithTensor:scaledInputTensor name:nil];
MPSGraphTensor* exponentMinusOneTensor = [mpsGraph subtractionWithPrimaryTensor:exponentTensor
secondaryTensor:unitTensor
name:nil];
MPSGraphTensor* alphaTimesTensor = [mpsGraph multiplicationWithPrimaryTensor:exponentMinusOneTensor
secondaryTensor:alphaTensor
name:nil];
MPSGraphTensor* predicateTensor = [mpsGraph greaterThanWithPrimaryTensor:inputTensor
secondaryTensor:zeroTensor
name:nil];
MPSGraphTensor* fusedOutput = [mpsGraph selectWithPredicateTensor:predicateTensor
truePredicateTensor:inputTensor
falsePredicateTensor:alphaTimesTensor
name:nil];
MPSGraphTensor* outputTensor = [mpsGraph multiplicationWithPrimaryTensor:fusedOutput
secondaryTensor:scaleTensor
name:nil];

newCachedGraph->inputTensor_ = inputTensor;
newCachedGraph->outputTensor_ = outputTensor;
});

auto selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self, nil, executeGatherOp);
auto outputPlaceholder = Placeholder(cachedGraph->outputTensor_, out.has_storage() ? out : result, nil, false);
auto feeds = dictionaryFromPlaceholders(selfPlaceholder);
runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder);
if (out.has_storage()) {
result.copy_(out);
}
}
}

// scale * (max(0, x) + min(0, alpha * (exp(input_scale * x) - 1) ))
TORCH_IMPL_FUNC(elu_out_mps)
(const Tensor& self, const Scalar& alpha, const Scalar& scale, const Scalar& input_scale, const Tensor& result) {
elu_variants_out_mps(self, alpha, scale, input_scale, result, "elu_out_mps");
}

TORCH_IMPL_FUNC(elu_backward_out_mps)
(const Tensor& grad_output,
const Scalar& alpha,
const Scalar& scale,
const Scalar& input_scale,
bool is_result,
const Tensor& self_or_result,
const Tensor& grad_input) {
using namespace mps;
using CachedGraph = MPSUnaryGradCachedGraph;
auto gradMemFormat = grad_input.suggest_memory_format();
bool executeGatherOp = !(grad_output.is_contiguous(gradMemFormat) && self_or_result.is_contiguous(gradMemFormat) &&
grad_input.is_contiguous(gradMemFormat));
Tensor out;
if (executeGatherOp && gradMemFormat == MemoryFormat::ChannelsLast) {
out = at::empty_like(grad_input, MemoryFormat::Contiguous);
}

// Empty output
if (grad_input.numel() == 0) {
return;
}

MPSStream* stream = getCurrentMPSStream();

@autoreleasepool {
std::string key = "elu_backward_out_mps:" + getTensorsStringKey({grad_output, self_or_result}) + ":" +
std::to_string(alpha.to<double>()) + ":" + std::to_string(scale.to<double>()) + ":" +
std::to_string(input_scale.to<double>()) + ":" + std::to_string(is_result);

auto cachedGraph = LookUpOrCreateCachedGraph<CachedGraph>(key, [&](auto mpsGraph, auto newCachedGraph) {
MPSGraphTensor* gradOutputTensor = mpsGraphRankedPlaceHolder(mpsGraph, grad_output);
MPSGraphTensor* selfOrResultTensor = mpsGraphRankedPlaceHolder(mpsGraph, self_or_result);
MPSGraphTensor* lessThanZeroGradTensor = nil;

if (is_result) {
MPSGraphTensor* alphaTensor = [mpsGraph constantWithScalar:alpha.to<double>()
shape:@[ @1 ]
dataType:getMPSDataType(grad_output)];
MPSGraphTensor* resultPlusAlphaTensor = [mpsGraph additionWithPrimaryTensor:selfOrResultTensor
secondaryTensor:alphaTensor
name:nil];
auto constMul = scale.to<double>() * input_scale.to<double>();
MPSGraphTensor* constMulTensor = [mpsGraph constantWithScalar:constMul
shape:@[ @1 ]
dataType:getMPSDataType(grad_output)];
lessThanZeroGradTensor = [mpsGraph multiplicationWithPrimaryTensor:resultPlusAlphaTensor
secondaryTensor:constMulTensor
name:nil];
} else {
MPSGraphTensor* inputScaleTensor = [mpsGraph constantWithScalar:input_scale.to<double>()
shape:@[ @1 ]
dataType:getMPSDataType(grad_output)];
MPSGraphTensor* scaledInputTensor = [mpsGraph multiplicationWithPrimaryTensor:selfOrResultTensor
secondaryTensor:inputScaleTensor
name:nil];
MPSGraphTensor* expTensor = [mpsGraph exponentWithTensor:scaledInputTensor name:nil];
auto constMul = scale.to<double>() * input_scale.to<double>() * alpha.to<double>();
MPSGraphTensor* constMulTensor = [mpsGraph constantWithScalar:constMul
shape:@[ @1 ]
dataType:getMPSDataType(grad_output)];
lessThanZeroGradTensor = [mpsGraph multiplicationWithPrimaryTensor:expTensor
secondaryTensor:constMulTensor
name:nil];
}

MPSGraphTensor* scaleTensor = [mpsGraph constantWithScalar:scale.to<double>()
shape:@[ @1 ]
dataType:getMPSDataType(grad_output)];
MPSGraphTensor* zeroTensor = [mpsGraph constantWithScalar:0.0f
shape:@[ @1 ]
dataType:getMPSDataType(grad_output)];
MPSGraphTensor* predicateTensor = [mpsGraph greaterThanWithPrimaryTensor:selfOrResultTensor
secondaryTensor:zeroTensor
name:nil];
MPSGraphTensor* gradTensor = [mpsGraph selectWithPredicateTensor:predicateTensor
truePredicateTensor:scaleTensor
falsePredicateTensor:lessThanZeroGradTensor
name:nil];
MPSGraphTensor* gradInputTensor = [mpsGraph multiplicationWithPrimaryTensor:gradTensor
secondaryTensor:gradOutputTensor
name:nil];

newCachedGraph->gradOutputTensor_ = gradOutputTensor;
newCachedGraph->inputTensor_ = selfOrResultTensor;
newCachedGraph->gradInputTensor_ = gradInputTensor;
});

Placeholder gradOutputPlaceholder = Placeholder(cachedGraph->gradOutputTensor_, grad_output, nil, executeGatherOp);
Placeholder selfOrResultPlaceholder = Placeholder(cachedGraph->inputTensor_, self_or_result, nil, executeGatherOp);
Placeholder gradInputPlaceholder =
Placeholder(cachedGraph->gradInputTensor_, out.has_storage() ? out : grad_input, nil, false);

auto feeds = dictionaryFromPlaceholders(gradOutputPlaceholder, selfOrResultPlaceholder);
runMPSGraph(stream, cachedGraph->graph(), feeds, gradInputPlaceholder);
if (out.has_storage()) {
grad_input.copy_(out);
}
}
}

TORCH_IMPL_FUNC(glu_out_mps)(const Tensor& self, const int64_t dim, const Tensor& output) {
using namespace mps;
using CachedGraph = MPSUnaryCachedGraph;
@ -896,6 +715,7 @@ TORCH_IMPL_FUNC(glu_out_mps)(const Tensor& self, const int64_t dim, const Tensor
if (output.numel() == 0)
return;

TORCH_CHECK_NOT_IMPLEMENTED(self.scalar_type() != kLong, "MPS doesn't know how to do exponent_i64");
// this can't pass anyway because a 0-dimensional tensor has "size" 1, which
// can't be evenly halved, but give a nicer error message here.
TORCH_CHECK(self.dim() > 0, "glu does not support 0-dimensional tensors");
@ -1009,6 +829,7 @@ TORCH_IMPL_FUNC(softplus_out_mps)
(const Tensor& self, const Scalar& beta, const Scalar& threshold, const Tensor& result) {
using namespace mps;
TORCH_CHECK(self.is_mps());
TORCH_CHECK_NOT_IMPLEMENTED(self.scalar_type() != kLong, "Not implemented for long");
// Applies the Softplus function :math:`\text{Softplus}(x) = \frac{1}{\beta} *
// \log(1 + \exp(\beta * x))` element-wise.
// For numerical stability the implementation reverts to the linear function
@ -1159,6 +980,8 @@ TORCH_IMPL_FUNC(mish_out_mps)
(const Tensor& self, const Tensor& result) {
using namespace mps;
TORCH_CHECK(self.is_mps());
TORCH_CHECK_NOT_IMPLEMENTED(self.scalar_type() != kLong, "MPS doesn't know how to do exponent_i64");
TORCH_CHECK_NOT_IMPLEMENTED(!c10::isComplexType(self.scalar_type()), "Mish for complex is not supported for MPS");

if (result.numel() == 0)
return;
@ -1207,6 +1030,8 @@ TORCH_IMPL_FUNC(mish_out_mps)
Tensor mish_backward_mps(const Tensor& grad_output, const Tensor& self) {
using namespace mps;
TORCH_CHECK(self.is_mps());
TORCH_CHECK_NOT_IMPLEMENTED(self.scalar_type() != kLong, "MPS doesn't know how to do exponent_i64");
TORCH_CHECK_NOT_IMPLEMENTED(!c10::isComplexType(self.scalar_type()), "Mish for complex is not supported for MPS");

Tensor grad_input = at::empty_like(self, self.suggest_memory_format());
if (grad_input.numel() == 0)
@ -1396,6 +1221,7 @@ TORCH_IMPL_FUNC(silu_out_mps)(const Tensor& self, const Tensor& result) {
using CachedGraph = MPSUnaryCachedGraph;

TORCH_CHECK(self.is_mps());
TORCH_CHECK_NOT_IMPLEMENTED(self.scalar_type() != kLong, "MPS doesn't know how to do exponent_i64");

// Empty output
if (result.numel() == 0)
@ -1,8 +1,10 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/Dispatch.h>
#include <ATen/TensorIterator.h>
#include <ATen/mps/MPSProfiler.h>
#include <ATen/native/Activation.h>
#include <ATen/native/mps/OperationUtils.h>
#include <ATen/native/mps/kernels/Activation.h>
#include <fmt/format.h>

namespace at::native {
@ -41,6 +43,30 @@ static void hardswish_backward_kernel(at::TensorIterator& iter) {
lib.exec_binary_kernel(iter, "hardswish_backward");
}

static void elu_kernel(TensorIteratorBase& iter, const Scalar& alpha, const Scalar& scale, const Scalar& input_scale) {
AT_DISPATCH_FLOATING_TYPES_AND2(c10::kHalf, c10::kBFloat16, iter.common_dtype(), "elu_mps", [&]() {
ELUParams<scalar_t> params{alpha.to<scalar_t>(), scale.to<scalar_t>(), input_scale.to<scalar_t>()};
lib.exec_unary_kernel_with_params(
iter, "elu", params, fmt::format("ELUParams_{}", mps::scalarToMetalTypeString(iter.common_dtype())));
});
}

static void elu_backward_kernel(TensorIteratorBase& iter,
const Scalar& alpha,
const Scalar& scale,
const Scalar& input_scale,
bool is_result) {
AT_DISPATCH_FLOATING_TYPES_AND2(c10::kHalf, c10::kBFloat16, iter.common_dtype(), "elu_backward_mps", [&]() {
ELUBackwardParams<scalar_t> params{
alpha.to<scalar_t>(), scale.to<scalar_t>(), input_scale.to<scalar_t>(), is_result};
lib.exec_binary_kernel_with_params(
iter,
"elu_backward",
params,
fmt::format("ELUBackwardParams_{}", mps::scalarToMetalTypeString(iter.common_dtype())));
});
}

static void leaky_relu_kernel(TensorIteratorBase& iter, const Scalar& negative_slope) {
lib.exec_unary_kernel(iter, "leaky_relu", negative_slope);
}
@ -56,6 +82,8 @@ REGISTER_DISPATCH(hardsigmoid_stub, hardsigmoid_kernel);
REGISTER_DISPATCH(hardsigmoid_backward_stub, hardsigmoid_backward_kernel);
REGISTER_DISPATCH(hardswish_stub, hardswish_kernel);
REGISTER_DISPATCH(hardswish_backward_stub, hardswish_backward_kernel);
REGISTER_DISPATCH(elu_stub, elu_kernel);
REGISTER_DISPATCH(elu_backward_stub, elu_backward_kernel);
REGISTER_DISPATCH(leaky_relu_stub, leaky_relu_kernel);
REGISTER_DISPATCH(leaky_relu_backward_stub, leaky_relu_backward_kernel);

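The dispatch above threads a type-suffixed params name ("ELUParams_float" and friends) into the kernel lookup; the Metal side typedefs exactly those names via REGISTER_ELU_OP, and exec_unary_kernel_with_params appends the suffix when it builds the pipeline-state key. A small sketch that makes the naming convention concrete (hypothetical helper, not an MPS API):

#include <cstdio>
#include <string>

// Mirrors the fmt::format("{}_{}_{}_{}{}", ...) pattern in
// exec_unary_kernel_with_params: op, layout, output type, input type, params.
std::string unary_kernel_name(const std::string& op, bool contiguous,
                              const std::string& out_t, const std::string& in_t,
                              const std::string& params_t) {
  return op + "_" + (contiguous ? "dense" : "strided") + "_" + out_t + "_" +
         in_t + "_" + params_t;
}

int main() {
  // Prints: elu_dense_float_float_ELUParams_float
  std::printf("%s\n",
      unary_kernel_name("elu", true, "float", "float", "ELUParams_float").c_str());
}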
@ -80,6 +80,11 @@ static void grid_sampler_2d_mps_impl(Tensor& output,
MPSGraphTensor* outputTensor_ = nil;
};

// Crashes with
// MPSGraphUtilities.mm:97:0: error: 'mps.sample_grid' op operand #0 must be tensor of mps native type values, but got
// 'tensor<2x3x5x20xcomplex<f32>>'
TORCH_CHECK_NOT_IMPLEMENTED(!c10::isComplexType(input.scalar_type()),
"grid_sampler_2d is not supported for complex on MPS");
@autoreleasepool {
std::string key = "grid_sampler_2d_mps" + getTensorsStringKey({input, grid}) + ":" +
std::to_string(interpolation_mode) + ":" + std::to_string(padding_mode) + ":" + std::to_string(align_corners);
@ -240,7 +240,7 @@ static void linalg_lu_factor_ex_out_mps_impl(const Tensor& A,
bool check_errors) {
using namespace mps;

TORCH_CHECK(!c10::isComplexType(A.scalar_type()) && !c10::isComplexType(LU.scalar_type()),
TORCH_CHECK(A.scalar_type() == kFloat && LU.scalar_type() == kFloat,
"linalg.lu_factor(): MPS doesn't support complex types.");
TORCH_CHECK(pivot, "linalg.lu_factor(): MPS doesn't allow pivot == False.");

@ -364,8 +364,7 @@ static void linalg_solve_out_mps_impl(const Tensor& A,
const Tensor& info) {
using namespace mps;

TORCH_CHECK(!c10::isComplexType(A.scalar_type()) && !c10::isComplexType(LU.scalar_type()),
"linalg.lu_factor(): MPS doesn't support complex types.");
TORCH_CHECK(A.scalar_type() == kFloat && LU.scalar_type() == kFloat, "linalg.lu_factor(): MPS only supports floats.");
Tensor A_t, B_t;
// If 'left' is false, reinterpret the problem so that Ax = B becomes A^T ⋅ (x^T) = B^T
// Then we solve the normal "left" case on the transposed matrices and transpose x finally to get the output
@ -1058,7 +1057,8 @@ static Tensor& linalg_solve_triangular_mps_impl(const Tensor& A,
using namespace mps;

checkInputsSolver(A, B, left, "linalg.solve_triangular");
TORCH_CHECK(!A.is_complex() && !B.is_complex(), "linalg.solve.triangular(); Not supported for complex yet!");
TORCH_CHECK(A.scalar_type() == kFloat && B.scalar_type() == kFloat,
"linalg.solve.triangular(); Only float is supported!");
Tensor A_t, B_t;
std::tie(B_t, A_t) = _linalg_broadcast_batch_dims(B, A, /*don't check errors*/ nullptr);
at::native::resize_output(out, B_t.sizes());
@ -416,6 +416,8 @@ static void nllnd_loss_forward_impl(Tensor& output,
int64_t reduction,
int64_t ignore_index,
bool is2D) {
TORCH_CHECK_NOT_IMPLEMENTED(!c10::isComplexType(output.scalar_type()),
"nlld_loss for complex is not supported for MPS");
std::vector<long long> reshapedTarget(target_arg.sizes().begin(), target_arg.sizes().end());
reshapedTarget.push_back(1);

@ -824,6 +826,9 @@ static void smooth_l1_loss_backward_impl(const Tensor& grad_output,
Tensor& huber_loss_out_mps(const Tensor& input, const Tensor& target, int64_t reduction, double delta, Tensor& output) {
std::string op_name = __func__;
using namespace mps;
TORCH_CHECK_NOT_IMPLEMENTED(input.scalar_type() != kLong, "MPS doesn't know how to do square_i64");
TORCH_CHECK_NOT_IMPLEMENTED(!c10::isComplexType(input.scalar_type()),
"huber_loss for complex is not supported for MPS");
TORCH_CHECK(delta > 0, "huber_loss does not support non-positive values for delta.")
TORCH_CHECK(target.is_same_size(input), op_name + ": target and input tensors must have identical shapes")
TORCH_CHECK(output.is_mps());
@@ -597,6 +597,7 @@ static void avg_pool2d_template(const Tensor& input,
                                 bool count_include_pad,
                                 const std::optional<int64_t> divisor_override,
                                 const std::string& op_name) {
+  TORCH_CHECK_NOT_IMPLEMENTED(!c10::isComplexType(input.scalar_type()), "Not implemented for complex");
   const Tensor& grad_output = *(at::borrow_from_optional_tensor(grad_output_opt));
   const bool is_backward_pass = grad_output.defined();
   const bool use_divisor = divisor_override.has_value() && divisor_override.value() != 0;
@@ -915,6 +916,8 @@ TORCH_IMPL_FUNC(max_pool2d_with_indices_out_mps)
  bool ceil_mode,
  const Tensor& output,
  const Tensor& indices) {
+  TORCH_CHECK_NOT_IMPLEMENTED(!c10::isComplexType(input.scalar_type()),
+                              "Max pooling for complex is not supported for MPS");
   bool use_graph = use_graph_for_max_pool2d(kernel_size, stride);
   if (use_graph) {
     auto indices_memory_format = indices.suggest_memory_format();
@@ -967,6 +970,8 @@ TORCH_IMPL_FUNC(max_pool2d_with_indices_backward_out_mps)
  bool ceil_mode,
  const Tensor& indices,
  const Tensor& grad_input) {
+  TORCH_CHECK_NOT_IMPLEMENTED(!c10::isComplexType(input.scalar_type()),
+                              "Max pooling for complex is not supported for MPS");
   mps::PoolingOpBlock pooling_op_block = ^PoolingOpFn(cachedGraph, desc) {
     MPSGraph* mpsGraph = cachedGraph.graph();
     return [mpsGraph maxPooling2DGradientWithGradientTensor:cachedGraph.gradOutputTensor
@@ -269,17 +269,22 @@ static void reduction_out_mps(const Tensor& input_t,
                                                        name:nil];
       castOutputTensor = [mpsGraph reductionSumWithTensor:bandPartWithTensor axes:@[ @0, @1 ] name:nil];
     } else if (reduction_type == MPSReductionType::NANSUM) {
-      // Create a 0 tensor of the same shape as inputTensor
-      MPSGraphTensor* zeros = [mpsGraph constantWithScalar:0.0 dataType:castInputTensor.dataType];
-      // Find NaNs
-      MPSGraphTensor* nanMask = [mpsGraph isNaNWithTensor:castInputTensor name:nil];
-      // Replace NaNs with 0
-      MPSGraphTensor* nanReplaced = [mpsGraph selectWithPredicateTensor:nanMask
-                                                    truePredicateTensor:zeros
-                                                   falsePredicateTensor:castInputTensor
-                                                                   name:nil];
-      // Sum
-      castOutputTensor = [mpsGraph reductionSumWithTensor:nanReplaced axes:wrappedAxes name:nil];
+      // Integral types cannot contain NaN, so just do regular sum
+      if (([castInputTensor dataType] & MPSDataTypeFloatBit) == 0) {
+        castOutputTensor = [mpsGraph reductionSumWithTensor:castInputTensor axes:wrappedAxes name:nil];
+      } else {
+        // Create a 0 tensor of the same shape as inputTensor
+        auto zeros = [mpsGraph constantWithScalar:0.0 dataType:castInputTensor.dataType];
+        // Find NaNs
+        auto nanMask = [mpsGraph isNaNWithTensor:castInputTensor name:nil];
+        // Replace NaNs with 0
+        auto nanReplaced = [mpsGraph selectWithPredicateTensor:nanMask
+                                           truePredicateTensor:zeros
+                                          falsePredicateTensor:castInputTensor
+                                                          name:nil];
+        // Sum
+        castOutputTensor = [mpsGraph reductionSumWithTensor:nanReplaced axes:wrappedAxes name:nil];
+      }
     }
 
     MPSGraphTensor* outputTensor = castOutputTensor;
@@ -442,6 +447,7 @@ static Tensor std_var_common_impl_mps(const Tensor& input_t,
                                       const std::optional<Scalar>& correction,
                                       bool keepdim,
                                       StdVarType stdVarType) {
+  TORCH_CHECK_NOT_IMPLEMENTED(input_t.scalar_type() != kLong, "Not implemented for MPS");
   using CachedGraph = MPSUnaryCachedGraph;
 
   IntArrayRef input_shape = input_t.sizes();
@@ -39,6 +39,7 @@ static void get_shapes(MPSShape* input_shape_readonly,
 TORCH_IMPL_FUNC(softmax_mps_out)
 (const Tensor& input_, const int64_t dim, const bool half_to_float, const Tensor& output) {
   TORCH_CHECK(!half_to_float, "softmax with half to float conversion is not supported on MPS");
+  TORCH_CHECK(c10::isFloatingType(input_.scalar_type()), "softmax only supported for floating types");
   static const bool is_macOS_15_0_or_newer = is_macos_13_or_newer(MacOSVersion::MACOS_VER_15_0_PLUS);
 
   if (input_.numel() == 0) {
@@ -18,6 +18,10 @@ static Tensor& bincount_mps_impl(const Tensor& self, const Tensor& weights, Tens
   MPSStream* stream = getCurrentMPSStream();
   bool has_weights = weights.defined();
 
+  // Crashes with
+  // MPSGraphUtilities.mm:190:0: error: 'mps.scatter' op operand #2 must be tensor of int values, but got 'tensor<5xi1>'
+  TORCH_CHECK_NOT_IMPLEMENTED(self.scalar_type() != kBool, "bincount is not supported for Bool");
+
   @autoreleasepool {
     std::string key = "bincount_mps_impl" + getTensorsStringKey({self, weights});
     auto cachedGraph = LookUpOrCreateCachedGraph<CachedGraph>(key, [&](auto mpsGraph, auto newCachedGraph) {
@@ -12064,8 +12064,7 @@
   device_check: NoCheck # TensorIterator
   python_module: nn
   dispatch:
-    CPU, CUDA: elu_out
-    MPS: elu_out_mps
+    CPU, CUDA, MPS: elu_out
 
 - func: elu(Tensor self, Scalar alpha=1, Scalar scale=1, Scalar input_scale=1) -> Tensor
   structured_delegate: elu.out
@@ -12078,8 +12077,7 @@
   structured_inherits: TensorIteratorBase
   python_module: nn
   dispatch:
-    CPU, CUDA: elu_backward_out
-    MPS: elu_backward_out_mps
+    CPU, CUDA, MPS: elu_backward_out
 
 - func: elu_backward(Tensor grad_output, Scalar alpha, Scalar scale, Scalar input_scale, bool is_result, Tensor self_or_result) -> Tensor
   structured_delegate: elu_backward.grad_input
@@ -65,6 +65,7 @@ list(APPEND ATen_CUDA_TEST_SRCS
   ${CMAKE_CURRENT_SOURCE_DIR}/cuda_device_test.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/cuda_distributions_test.cu
   ${CMAKE_CURRENT_SOURCE_DIR}/cuda_dlconvertor_test.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/cuda_event_test.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/cuda_exchange_device_test.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/cuda_generator_test.cu
   ${CMAKE_CURRENT_SOURCE_DIR}/cuda_half_test.cu
aten/src/ATen/test/cuda_event_test.cpp (new file, 36 lines)
@@ -0,0 +1,36 @@
+#include <gtest/gtest.h>
+
+#include <ATen/cuda/CUDAEvent.h>
+#include <ATen/cuda/CUDAGraph.h>
+#include <ATen/cuda/Sleep.h>
+
+TEST(CUDAEventTest, testCUDAExternalEvent) {
+  if (!at::cuda::is_available()) {
+    return;
+  }
+
+  // Create two external CUDA events
+  unsigned int flags = cudaEventDefault | cudaEventExternal;
+  auto event1 = at::cuda::CUDAEvent(flags);
+  auto event2 = at::cuda::CUDAEvent(flags);
+  // Ensure external CUDAEvent remain valid and functional after being moved.
+  auto start_event = std::move(event1);
+  auto end_event = std::move(event2);
+
+  auto stream = at::cuda::getStreamFromPool();
+  at::cuda::setCurrentCUDAStream(stream);
+
+  auto graph = at::cuda::CUDAGraph();
+  graph.capture_begin();
+  start_event.record();
+  at::cuda::sleep(100000);
+  end_event.record();
+  graph.capture_end();
+
+  // External events should correctly record timestamps even when used inside
+  // CUDA graphs, and elapsed_time() between them should be positive.
+  stream.synchronize();
+  graph.replay();
+  at::cuda::device_synchronize();
+  EXPECT_TRUE(start_event.elapsed_time(end_event) > 0);
+}
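
As a side-by-side reference for the test above, here is a minimal eager (non-graph) timing sketch with the same event API. It is illustrative only: it assumes a CUDA device is available, and it passes cudaEventDefault explicitly because at::cuda::CUDAEvent otherwise defaults to timing-disabled events; cudaEventExternal is only needed once events participate in graph capture.

#include <ATen/cuda/CUDAEvent.h>
#include <ATen/cuda/Sleep.h>
#include <c10/cuda/CUDAStream.h>

// Baseline eager timing: record, spin, record, then read elapsed time.
float time_sleep_ms() {
  auto stream = at::cuda::getCurrentCUDAStream();
  at::cuda::CUDAEvent start(cudaEventDefault);
  at::cuda::CUDAEvent end(cudaEventDefault);
  start.record(stream);
  at::cuda::sleep(100000);  // spin kernel enqueued on the current stream
  end.record(stream);
  end.synchronize();        // wait for the end event before reading the timer
  return start.elapsed_time(end);  // milliseconds
}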
@@ -48,7 +48,7 @@ void warnDeprecatedDataPtr() {
   TORCH_CHECK(false, "Cannot access data pointer of Storage that is invalid.");
 }
 
-void StorageImpl::incref_pyobject() const {
+void StorageImpl::incref_pyobject() const noexcept {
   // Because intrusive_ptr incref uses relaxed memory order, we need to
   // do an acquire fence to ensure that the kHasPyObject bit was
   // observed before the load of the PyObject* below.
@@ -59,12 +59,12 @@ void StorageImpl::incref_pyobject() const {
   (*pyobj_slot_.pyobj_interpreter())->incref(obj);
 }
 
-void StorageImpl::decref_pyobject() const {
+void StorageImpl::decref_pyobject() const noexcept {
   PyObject* obj = pyobj_slot_.load_pyobj();
   (*pyobj_slot_.pyobj_interpreter())->decref(obj);
 }
 
-bool StorageImpl::try_incref_pyobject() const {
+bool StorageImpl::try_incref_pyobject() const noexcept {
   c10::impl::PyInterpreter* interp = pyobj_slot_.pyobj_interpreter();
   if (C10_UNLIKELY(!interp)) {
     return false;
@@ -105,11 +105,11 @@ struct C10_API StorageImpl : public c10::intrusive_ptr_target {
     data_ptr_.clear();
   }
 
-  void incref_pyobject() const override final;
+  void incref_pyobject() const noexcept override final;
 
-  void decref_pyobject() const override final;
+  void decref_pyobject() const noexcept override final;
 
-  bool try_incref_pyobject() const override final;
+  bool try_incref_pyobject() const noexcept override final;
 
   size_t nbytes() const {
     // OK to do this instead of maybe_as_int as nbytes is guaranteed positive
@@ -988,7 +988,7 @@ void TensorImpl::empty_tensor_restride_symint(MemoryFormat memory_format) {
   }
 }
 
-void TensorImpl::incref_pyobject() const {
+void TensorImpl::incref_pyobject() const noexcept {
   // Because intrusive_ptr incref uses relaxed memory order, we need to
   // do an acquire fence to ensure that the kHasPyObject bit was
   // observed before the load of the PyObject* below.
@@ -999,12 +999,12 @@ void TensorImpl::incref_pyobject() const {
   (*pyobj_slot_.pyobj_interpreter())->incref(obj);
 }
 
-void TensorImpl::decref_pyobject() const {
+void TensorImpl::decref_pyobject() const noexcept {
   PyObject* obj = pyobj_slot_.load_pyobj();
   (*pyobj_slot_.pyobj_interpreter())->decref(obj);
 }
 
-bool TensorImpl::try_incref_pyobject() const {
+bool TensorImpl::try_incref_pyobject() const noexcept {
   c10::impl::PyInterpreter* interp = pyobj_slot_.pyobj_interpreter();
   if (C10_UNLIKELY(!interp)) {
     return false;
@@ -2178,11 +2178,11 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target {
     return &pyobj_slot_;
   }
 
-  void incref_pyobject() const override final;
+  void incref_pyobject() const noexcept override final;
 
-  void decref_pyobject() const override final;
+  void decref_pyobject() const noexcept override final;
 
-  bool try_incref_pyobject() const override final;
+  bool try_incref_pyobject() const noexcept override final;
 
  private:
   // See NOTE [std::optional operator usage in CUDA]
@@ -68,6 +68,10 @@ inline bool has_pyobject(uint64_t combined_refcount) {
   return (combined_refcount & kHasPyObject) != 0;
 }
 
+inline bool is_uniquely_owned(uint64_t combined_refcount) {
+  return (combined_refcount & ~detail::kHasPyObject) == detail::kUniqueRef;
+}
+
 // The only requirement for refcount increment is that it happens-before
 // decrement, so no additional memory ordering is needed.
 inline uint64_t atomic_combined_refcount_increment(
@@ -287,9 +291,9 @@ class C10_API intrusive_ptr_target {
    * These two methods are called when the refcount transitions between one
    * and two and the object has a PyObject wrapper.
    */
-  virtual void incref_pyobject() const {}
-  virtual void decref_pyobject() const {}
-  virtual bool try_incref_pyobject() const {
+  virtual void incref_pyobject() const noexcept {}
+  virtual void decref_pyobject() const noexcept {}
+  virtual bool try_incref_pyobject() const noexcept {
     return false;
   }
 
@@ -363,7 +367,7 @@ class intrusive_ptr final {
   template <typename, typename...>
   friend class pybind11::class_;
 
-  void retain_() {
+  void retain_() noexcept {
    if (target_ != NullType::singleton()) {
       uint64_t combined = detail::atomic_combined_refcount_increment(
           target_->combined_refcount_, detail::kReferenceCountOne);
@@ -377,9 +381,7 @@ class intrusive_ptr final {
       // PyObject. In other words, we need to ensure that the PyObject stays
       // alive now that we have a C++ reference to this object in addition to
       // the PyObject itself.
-      if (C10_UNLIKELY(
-              detail::has_pyobject(combined) &&
-              detail::refcount(combined) == 2)) {
+      if (detail::has_pyobject(combined) && detail::refcount(combined) == 2) {
         target_->incref_pyobject();
       }
     } else {
@@ -392,51 +394,60 @@ class intrusive_ptr final {
 
   void reset_() noexcept {
     if (target_ != NullType::singleton()) {
-      if (is_uniquely_owned()) {
-        // Both counts are 1, so there are no weak references and
-        // we are releasing the last strong reference. No other
-        // threads can observe the effects of this target_ deletion
-        // call (e.g. calling use_count()) without a data race.
-        target_->combined_refcount_.store(0, std::memory_order_relaxed);
-        delete target_;
-        return;
-      }
-
-      auto combined_refcount = detail::atomic_combined_refcount_decrement(
-          target_->combined_refcount_, detail::kReferenceCountOne);
-      uint32_t new_refcount = detail::refcount(combined_refcount);
-      bool has_pyobject = detail::has_pyobject(combined_refcount);
-      if (new_refcount == 0) {
-        bool should_delete = detail::weakcount(combined_refcount) == 1;
-        // See comment above about weakcount. As long as refcount>0,
-        // weakcount is one larger than the actual number of weak references.
-        // So we need to decrement it here.
-        if (!should_delete) {
-          // justification for const_cast: release_resources is basically a
-          // destructor and a destructor always mutates the object, even for
-          // const objects.
-          // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
-          const_cast<std::remove_const_t<TTarget>*>(target_)
-              ->release_resources();
-          should_delete = detail::atomic_weakcount_decrement(
-                              target_->combined_refcount_) == 0;
-        }
-        if (should_delete) {
-          delete target_;
-        }
-      } else if constexpr (detail::TargetTraits<TTarget>::can_have_pyobject) {
-        // If the refcount transitioned from 2 to 1, we need to decref the
-        // PyObject. In other words, we don't want to keep the PyObject alive if
-        // there are no C++ references to this object other than the PyObject
-        // itself.
-        if (C10_UNLIKELY(has_pyobject && new_refcount == 1)) {
-          target_->decref_pyobject();
-        }
-      } else {
-        TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
-            !has_pyobject,
-            "TargetTraits indicates that type cannot have PyObject, but refcount has PyObject bit set.");
-      }
+      reset_not_null_(target_);
+    }
+  }
+
+  // C10_NOINLINE to keep binary size a bit smaller. We pass TTarget* here
+  // to avoid an extra pointer dereference in the call from reset_().
+  C10_NOINLINE static void reset_not_null_(TTarget* target) noexcept {
+    if (detail::is_uniquely_owned(
+            target->combined_refcount_.load(std::memory_order_acquire))) {
+      // Both counts are 1, so there are no weak references and
+      // we are releasing the last strong reference. No other
+      // threads can observe the effects of this target deletion
+      // call (e.g. calling use_count()) without a data race.
+      target->combined_refcount_.store(0, std::memory_order_relaxed);
+      delete target;
+      return;
+    }
+
+    auto combined_refcount = detail::atomic_combined_refcount_decrement(
+        target->combined_refcount_, detail::kReferenceCountOne);
+    uint32_t new_refcount = detail::refcount(combined_refcount);
+    bool has_pyobject = detail::has_pyobject(combined_refcount);
+    if (new_refcount == 0) {
+      if (detail::weakcount(combined_refcount) == 1) {
+        delete target;
+        return;
+      }
+      // See comment above about weakcount. As long as refcount>0,
+      // weakcount is one larger than the actual number of weak references.
+      // So we need to decrement it here.
+      release_resources_and_decrement_weakrefs_(target);
+    } else if constexpr (detail::TargetTraits<TTarget>::can_have_pyobject) {
+      // If the refcount transitioned from 2 to 1, we need to decref the
+      // PyObject. In other words, we don't want to keep the PyObject alive if
+      // there are no C++ references to this object other than the PyObject
+      // itself.
+      if (has_pyobject && new_refcount == 1) {
+        target->decref_pyobject();
+      }
+    } else {
+      TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
+          !has_pyobject,
+          "TargetTraits indicates that type cannot have PyObject, but refcount has PyObject bit set.");
+    }
+  }
+
+  C10_NOINLINE static void release_resources_and_decrement_weakrefs_(
+      TTarget* target) noexcept {
+    // justification for const_cast: release_resources is basically a
+    // destructor and a destructor always mutates the object, even for
+    // const objects.
+    const_cast<std::remove_const_t<TTarget>*>(target)->release_resources();
+    if (detail::atomic_weakcount_decrement(target->combined_refcount_) == 0) {
+      delete target;
     }
   }
 
@@ -607,9 +618,8 @@ class intrusive_ptr final {
    */
   bool is_uniquely_owned() const noexcept {
     TORCH_INTERNAL_ASSERT_DEBUG_ONLY(target_ != NullType::singleton());
-    uint64_t combined =
-        target_->combined_refcount_.load(std::memory_order_acquire);
-    return (combined & ~detail::kHasPyObject) == detail::kUniqueRef;
+    return detail::is_uniquely_owned(
+        target_->combined_refcount_.load(std::memory_order_acquire));
   }
 
   /**
@@ -1174,9 +1184,7 @@ inline void incref(intrusive_ptr_target* self) {
       self->combined_refcount_, detail::kReferenceCountOne);
 
 #ifndef C10_MOBILE
-  if (C10_UNLIKELY(
-          detail::has_pyobject(combined) &&
-          detail::refcount(combined) == 2)) {
+  if (detail::has_pyobject(combined) && detail::refcount(combined) == 2) {
     self->incref_pyobject();
   }
 #else
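
For readers following the refactor above: a minimal standalone sketch of the combined-refcount word that detail::is_uniquely_owned inspects. The bit positions below are assumptions for illustration (strong count in the low 32 bits, weak count above it, one has-PyObject flag bit); this is a sketch, not the c10 implementation.

#include <cassert>
#include <cstdint>

// Assumed layout: strong refcount in bits 0..31, weak refcount above it,
// and a single "has PyObject" flag in the top bit.
constexpr uint64_t kReferenceCountOne = 1;
constexpr uint64_t kWeakReferenceCountOne = uint64_t(1) << 32;
constexpr uint64_t kHasPyObject = uint64_t(1) << 63;
// A freshly constructed target holds one strong ref plus the implicit weak
// ref that exists while the strong count is nonzero.
constexpr uint64_t kUniqueRef = kReferenceCountOne | kWeakReferenceCountOne;

// Mirrors the helper added in the diff: mask off the PyObject flag, then
// require exactly one strong and one weak reference.
inline bool is_uniquely_owned(uint64_t combined) {
  return (combined & ~kHasPyObject) == kUniqueRef;
}

int main() {
  assert(is_uniquely_owned(kUniqueRef));
  assert(is_uniquely_owned(kUniqueRef | kHasPyObject));        // flag ignored
  assert(!is_uniquely_owned(kUniqueRef + kReferenceCountOne)); // two owners
  return 0;
}

This is what lets reset_not_null_ take its fast path: if the masked word equals kUniqueRef, no other strong or weak reference can observe the object, so it can be deleted without the decrement-and-check sequence.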
@@ -893,11 +893,13 @@ class DeviceCachingAllocator {
   }
 
   bool release_cached_blocks(MempoolId_t mempool_id) {
+    bool streams_synced = false;
     if (mempool_id.first == 0 && mempool_id.second == 0 &&
         captures_underway.empty()) {
       synchronize_and_free_events();
       // See Note [Safe to Free Blocks on BlockPool]
       c10::xpu::syncStreamsOnDevice(device_index);
+      streams_synced = true;
 
       release_blocks(large_blocks);
       release_blocks(small_blocks);
@@ -916,6 +918,12 @@ class DeviceCachingAllocator {
         continue;
       }
     }
+
+    if (!streams_synced) {
+      // See Note [Safe to Free Blocks on BlockPool]
+      c10::xpu::syncStreamsOnDevice(device_index);
+      streams_synced = true;
+    }
     TORCH_INTERNAL_ASSERT(it->second->use_count == 0);
     release_blocks(it->second->small_blocks);
     release_blocks(it->second->large_blocks);
@@ -1219,6 +1227,63 @@ class DeviceCachingAllocator {
     allowed_memory_maximum = static_cast<size_t>(fraction * device_total);
     set_fraction = true;
   }
+
+  void createOrIncrefPool(
+      MempoolId_t mempool_id,
+      XPUAllocator* allocator = nullptr) {
+    std::scoped_lock<std::recursive_mutex> lock(mutex);
+    create_or_incref_pool(mempool_id, allocator);
+  }
+
+  int getPoolUseCount(MempoolId_t mempool_id) {
+    std::scoped_lock<std::recursive_mutex> lock(mutex);
+    auto it = graph_pools.find(mempool_id);
+    if (it == graph_pools.end()) {
+      return 0;
+    }
+    return it->second->use_count;
+  }
+
+  // Called by XPUGraph::capture_begin
+  void beginAllocateToPool(
+      MempoolId_t mempool_id,
+      std::function<bool(sycl::queue*)> filter) {
+    std::lock_guard<std::recursive_mutex> lock(mutex);
+    create_or_incref_pool(mempool_id);
+    auto not_found = std::all_of(
+        captures_underway.begin(),
+        captures_underway.end(),
+        [&](const auto& entry) { return entry.first != mempool_id; });
+    TORCH_CHECK(
+        not_found, "beginAllocateToPool: already recording to mempool_id");
+    captures_underway.emplace_back(mempool_id, std::move(filter));
+  }
+
+  // Called by XPUGraph::capture_end
+  void endAllocateToPool(MempoolId_t mempool_id) {
+    std::lock_guard<std::recursive_mutex> lock(mutex);
+
+    auto it = std::find_if(
+        captures_underway.begin(),
+        captures_underway.end(),
+        [&](const auto& entry) { return entry.first == mempool_id; });
+    TORCH_INTERNAL_ASSERT(
+        it != captures_underway.end(),
+        "endAllocatePool: not currently recording to mempool_id");
+    captures_underway.erase(it);
+  }
+
+  // Called by XPUGraph::reset and MemPool::~MemPool()
+  void releasePool(MempoolId_t mempool_id) {
+    std::lock_guard<std::recursive_mutex> lock(mutex);
+    auto pp = get_private_pool(mempool_id);
+    auto uc = --(pp->use_count);
+    TORCH_INTERNAL_ASSERT(uc >= 0);
+    if (uc == 0) {
+      bool inserted = graph_pools_freeable.insert({mempool_id, pp}).second;
+      TORCH_INTERNAL_ASSERT(inserted);
+    }
+  }
 };
 
 static void local_raw_delete(void* ptr);
@@ -1408,6 +1473,39 @@ class XPUAllocator : public DeviceAllocator {
         ". Please set within (0, 1].");
     device_allocators[device]->setMemoryFraction(fraction);
   }
+
+  void createOrIncrefPool(
+      c10::DeviceIndex device,
+      MempoolId_t mempool_id,
+      XPUAllocator* allocator) {
+    assertValidDevice(device);
+    device_allocators[device]->createOrIncrefPool(
+        std::move(mempool_id), allocator);
+  }
+
+  void beginAllocateToPool(
+      c10::DeviceIndex device,
+      MempoolId_t mempool_id,
+      std::function<bool(sycl::queue*)> filter) {
+    assertValidDevice(device);
+    device_allocators[device]->beginAllocateToPool(
+        std::move(mempool_id), std::move(filter));
+  }
+
+  void endAllocateToPool(c10::DeviceIndex device, MempoolId_t mempool_id) {
+    assertValidDevice(device);
+    device_allocators[device]->endAllocateToPool(mempool_id);
+  }
+
+  void releasePool(c10::DeviceIndex device, MempoolId_t mempool_id) {
+    assertValidDevice(device);
+    device_allocators[device]->releasePool(std::move(mempool_id));
+  }
+
+  int getPoolUseCount(c10::DeviceIndex device, MempoolId_t mempool_id) {
+    assertValidDevice(device);
+    return device_allocators[device]->getPoolUseCount(std::move(mempool_id));
+  }
 };
 
 static XPUAllocator allocator;
@@ -1464,6 +1562,92 @@ void setMemoryFraction(double fraction, DeviceIndex device) {
   return allocator.setMemoryFraction(fraction, device);
 }
 
+void createOrIncrefPool(
+    c10::DeviceIndex device,
+    MempoolId_t mempool_id,
+    XPUAllocator* allocator_ptr) {
+  return allocator.createOrIncrefPool(device, mempool_id, allocator_ptr);
+}
+
+void beginAllocateToPool(
+    c10::DeviceIndex device,
+    MempoolId_t mempool_id,
+    std::function<bool(sycl::queue*)> filter) {
+  return allocator.beginAllocateToPool(device, mempool_id, std::move(filter));
+}
+
+void endAllocateToPool(c10::DeviceIndex device, MempoolId_t mempool_id) {
+  return allocator.endAllocateToPool(device, mempool_id);
+}
+
+void releasePool(c10::DeviceIndex device, MempoolId_t mempool_id) {
+  return allocator.releasePool(device, mempool_id);
+}
+
+int getPoolUseCount(c10::DeviceIndex device, MempoolId_t mempool_id) {
+  return allocator.getPoolUseCount(device, mempool_id);
+}
+
 REGISTER_ALLOCATOR(kXPU, &allocator)
 
 } // namespace c10::xpu::XPUCachingAllocator
+
+namespace c10::xpu {
+
+// uid_ is incremented when a user creates a MemPool,
+//
+// uuid_ is incremented when XPUGraph creates a MemPool
+// as a result of a user not providing a pool.
+
+std::atomic<CaptureId_t> MemPool::uid_{1};
+std::atomic<CaptureId_t> MemPool::uuid_{1};
+
+MemPool::MemPool(
+    XPUCachingAllocator::XPUAllocator* allocator,
+    bool is_user_created,
+    bool use_on_oom)
+    : allocator_(allocator), is_user_created_(is_user_created) {
+  if (is_user_created_) {
+    id_ = {0, uid_++};
+  } else {
+    id_ = {uuid_++, 0};
+  }
+  device_ = c10::xpu::current_device();
+  XPUCachingAllocator::createOrIncrefPool(device_, id_, allocator);
+  if (use_on_oom) {
+    // XPU doesn't support use_on_oom yet
+    TORCH_WARN(
+        "XPUCachingAllocator::MemPool: use_on_oom is not supported on XPU");
+  }
+}
+
+MemPool::~MemPool() {
+  TORCH_INTERNAL_ASSERT(use_count() == 1);
+  XPUCachingAllocator::releasePool(device_, id_);
+  c10::xpu::XPUCachingAllocator::emptyCache(id_); // release cached blocks
+}
+
+MempoolId_t MemPool::id() {
+  return id_;
+}
+
+XPUCachingAllocator::XPUAllocator* MemPool::allocator() {
+  return allocator_;
+}
+
+int MemPool::use_count() {
+  return XPUCachingAllocator::getPoolUseCount(device_, id_);
+}
+
+c10::DeviceIndex MemPool::device() {
+  return device_;
+}
+
+MempoolId_t MemPool::graph_pool_handle(bool is_user_created) {
+  if (is_user_created) {
+    return {0, uid_++};
+  }
+  return {uuid_++, 0};
+}
+
+} // namespace c10::xpu
@@ -33,4 +33,59 @@ C10_XPU_API double getMemoryFraction(DeviceIndex device);
 
 C10_XPU_API void setMemoryFraction(double fraction, DeviceIndex device);
 
+class XPUAllocator;
+
+C10_XPU_API void createOrIncrefPool(
+    c10::DeviceIndex device,
+    c10::MempoolId_t mempool_id,
+    XPUAllocator* allocator = nullptr);
+
+C10_XPU_API void beginAllocateToPool(
+    c10::DeviceIndex device,
+    c10::MempoolId_t mempool_id,
+    std::function<bool(sycl::queue*)> filter);
+
+C10_XPU_API void endAllocateToPool(
+    c10::DeviceIndex device,
+    c10::MempoolId_t mempool_id);
+
+C10_XPU_API void releasePool(
+    c10::DeviceIndex device,
+    c10::MempoolId_t mempool_id);
+
+C10_XPU_API int getPoolUseCount(
+    c10::DeviceIndex device,
+    c10::MempoolId_t mempool_id);
+
 } // namespace c10::xpu::XPUCachingAllocator
+
+namespace c10::xpu {
+
+using c10::CaptureId_t;
+using c10::MempoolId_t;
+struct C10_XPU_API MemPool {
+  MemPool(
+      XPUCachingAllocator::XPUAllocator* allocator = nullptr,
+      bool is_user_created = true,
+      bool use_on_oom = false);
+  MemPool(const MemPool&) = delete;
+  MemPool(MemPool&&) = default;
+  MemPool& operator=(const MemPool&) = delete;
+  MemPool& operator=(MemPool&&) = default;
+  ~MemPool();
+
+  MempoolId_t id();
+  XPUCachingAllocator::XPUAllocator* allocator();
+  int use_count();
+  c10::DeviceIndex device();
+  static MempoolId_t graph_pool_handle(bool is_user_created = true);
+
+ private:
+  static std::atomic<CaptureId_t> uid_;
+  static std::atomic<CaptureId_t> uuid_;
+  XPUCachingAllocator::XPUAllocator* allocator_;
+  bool is_user_created_;
+  MempoolId_t id_;
+  c10::DeviceIndex device_;
+};
+} // namespace c10::xpu
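
To make the new XPU pool API concrete, here is a hedged usage sketch based only on the declarations above. The header path and the framing that XPUGraph normally drives the begin/end calls during capture are assumptions for illustration, not part of this diff.

#include <c10/xpu/XPUCachingAllocator.h>

// Sketch: route allocations on the current device into a user-created
// private pool for the duration of a "capture" region.
void capture_into_user_pool() {
  // Constructing a MemPool registers (increfs) a private pool.
  c10::xpu::MemPool pool;  // user-created, so id() is {0, uid}

  // Divert allocations whose queue passes the filter into the pool.
  c10::xpu::XPUCachingAllocator::beginAllocateToPool(
      pool.device(), pool.id(), [](sycl::queue*) { return true; });

  // ... allocations made here land in the private pool ...

  c10::xpu::XPUCachingAllocator::endAllocateToPool(pool.device(), pool.id());

  // ~MemPool calls releasePool (use_count drops to 0) and then
  // emptyCache(id_) to free the pool's cached blocks.
}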
@@ -3,7 +3,11 @@
 
 #include "tensor_accessor_kernel.h"
 
+#ifdef USE_ROCM
+#include <hip/hip_runtime.h>
+#else
 #include <cuda_runtime.h>
+#endif
 #include <torch/csrc/stable/library.h>
 #include <torch/csrc/stable/ops.h>
 #include <torch/csrc/stable/tensor.h>
@@ -0,0 +1,20 @@
+#include <torch/csrc/stable/library.h>
+#include <torch/csrc/stable/tensor.h>
+#include <torch/csrc/stable/ops.h>
+
+using torch::stable::Tensor;
+
+torch::headeronly::HeaderOnlyArrayRef<int64_t> my_shape(Tensor t) {
+  return t.sizes();
+}
+
+STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic_2_10, m) {
+  m.def("my_shape(Tensor t) -> int[]");
+}
+
+STABLE_TORCH_LIBRARY_IMPL(
+    libtorch_agnostic_2_10,
+    CompositeExplicitAutograd,
+    m) {
+  m.impl("my_shape", TORCH_BOX(&my_shape));
+}
@@ -199,6 +199,18 @@ def my_view(t, size) -> Tensor:
     return torch.ops.libtorch_agnostic_2_10.my_view.default(t, size)
 
 
+def my_shape(t) -> tuple[int]:
+    """
+    Returns a shape of the input tensor.
+
+    Args:
+        t: Tensor - input tensor
+
+    Returns: tuple - shape of the imput tensor.
+    """
+    return torch.ops.libtorch_agnostic_2_10.my_shape.default(t)
+
+
 def get_any_data_ptr(t, mutable) -> int:
     """
     Return data pointer value of the tensor.
@@ -22,9 +22,15 @@ import tempfile
 from pathlib import Path
 
 from torch.testing._internal.common_utils import IS_WINDOWS, run_tests, TestCase
-from torch.utils.cpp_extension import CUDA_HOME, include_paths as torch_include_paths
+from torch.utils.cpp_extension import (
+    CUDA_HOME,
+    include_paths as torch_include_paths,
+    ROCM_HOME,
+)
 
 
+GPU_HOME = CUDA_HOME or ROCM_HOME
+
 # TODO: Fix this error in Windows:
 # numba.cuda.cudadrv.driver:driver.py:384 Call to cuInit results in CUDA_ERROR_NO_DEVICE
 if not IS_WINDOWS:
@@ -42,8 +48,8 @@ if not IS_WINDOWS:
                 f"-I{path}" for path in torch_include_paths(device_type="cpu")
             ]
             cls.cuda_includes = []
-            if CUDA_HOME:
-                cuda_include_path = os.path.join(CUDA_HOME, "include")
+            if GPU_HOME:
+                cuda_include_path = os.path.join(GPU_HOME, "include")
                 if os.path.exists(cuda_include_path):
                     cls.cuda_includes = [f"-I{cuda_include_path}"]
 
@@ -105,13 +111,13 @@ if not IS_WINDOWS:
             Compile a CUDA file with TORCH_TARGET_VERSION=2.9.0.
             Returns (success, error_message).
             """
-            if not CUDA_HOME:
-                return False, "CUDA_HOME not set"
+            if not GPU_HOME:
+                return False, "one of CUDA_HOME and ROCM_HOME should be set but is not"
 
             torch_version_2_9 = "0x0209000000000000"
 
             cmd = [
-                os.path.join(CUDA_HOME, "bin", "nvcc"),
+                os.path.join(GPU_HOME, "bin", "nvcc" if CUDA_HOME else "hipcc"),
                 "-c",
                 "-std=c++17",
                 f"-DTORCH_TARGET_VERSION={torch_version_2_9}",
@@ -120,6 +126,9 @@ if not IS_WINDOWS:
                 *self.cuda_includes,
             ]
 
+            if ROCM_HOME:
+                cmd.extend(["-DUSE_ROCM=1"])
+
            cmd.extend([str(source_file), "-o", str(output_file)])
 
             result = subprocess.run(cmd, capture_output=True, text=True, timeout=30)
@@ -1,6 +1,10 @@
 #include "kernel.h"
 
+#ifdef USE_ROCM
+#include <hip/hip_runtime.h>
+#else
 #include <cuda_runtime.h>
+#endif
 #include <torch/csrc/stable/library.h>
 #include <torch/csrc/stable/ops.h>
 #include <torch/csrc/stable/tensor.h>
@@ -711,6 +711,15 @@ if not IS_WINDOWS:
                 expected_flat = t.view([-1])
                 self.assertEqual(result_flat, expected_flat)
 
+            @skipIfTorchVersionLessThan(2, 10)
+            def test_my_shape(self, device):
+                import libtorch_agnostic_2_10 as libtorch_agnostic
+
+                expected = (3, 5)
+                t = torch.rand(*expected, device=device)
+                shape = libtorch_agnostic.ops.my_shape(t)
+                self.assertEqual(shape, expected)
+
             def test_mv_tensor_accessor(self, device):
                 import libtorch_agnostic_2_9 as libtorch_agnostic
 
@@ -428,7 +428,14 @@ class TestFullyShardCommunication(FSDPTest):
     @xfailIf(TEST_XPU)  # https://github.com/intel/torch-xpu-ops/issues/1571
     def test_set_reduce_scatter_divide_factor(self):
         self.run_subtests(
-            {"divide_factor": [self.world_size * 2, self.world_size]},
+            {
+                "divide_factor": [self.world_size * 2, self.world_size],
+                "mesh_shape": [
+                    (self.world_size,),
+                    (self.world_size // 2, 2),
+                    (self.world_size, 1),
+                ],
+            },
             self._test_set_reduce_scatter_divide_factor,
         )
         self.run_subtests(
@@ -436,18 +443,31 @@ class TestFullyShardCommunication(FSDPTest):
             self._test_set_reduce_scatter_divide_factor_mixed_prevision,
         )
 
-    def _test_set_reduce_scatter_divide_factor(self, divide_factor: float):
+    def _test_set_reduce_scatter_divide_factor(
+        self, divide_factor: float, mesh_shape: tuple[int] | tuple[int, int]
+    ):
         torch.manual_seed(42)
         model_args = ModelArgs(dropout_p=0.0, weight_tying=False)
         model = Transformer(model_args)
         ref_model = copy.deepcopy(model).to(device_type)
         ref_optim = torch.optim.AdamW(ref_model.parameters(), lr=1e-2)
+        mesh_dim_names = ("outer",) if len(mesh_shape) == 1 else ("outer", "inner")
+        mesh = init_device_mesh(
+            device_type.type, mesh_shape, mesh_dim_names=mesh_dim_names
+        )
         for module in model.modules():
             if isinstance(module, TransformerBlock):
-                fully_shard(module, reshard_after_forward=False)
-        model = fully_shard(model, reshard_after_forward=False)
+                fully_shard(module, reshard_after_forward=False, mesh=mesh)
+        model = fully_shard(model, reshard_after_forward=False, mesh=mesh)
         optim = torch.optim.AdamW(model.parameters(), lr=1e-2)
-        model.set_reduce_scatter_divide_factor(divide_factor)
+        model.set_gradient_divide_factor(divide_factor)
 
+        # Get ref_model params which should have the specific division factor applied
+        block_params = set()
+        for ref_mod in ref_model.modules():
+            if isinstance(ref_mod, TransformerBlock):
+                block_params.update(ref_mod.parameters())
+        non_block_params = set(ref_model.parameters()) - block_params
+
         torch.manual_seed(42 + self.rank)
         inp = torch.randint(0, model_args.vocab_size, (2, 16), device=device_type.type)
@@ -456,16 +476,18 @@ class TestFullyShardCommunication(FSDPTest):
         ref_loss = ref_model(inp).sum()
         ref_loss.backward()
         for param in ref_model.parameters():
-            param.grad.mul_(1.0 / divide_factor)
+            factor = divide_factor if param in non_block_params else self.world_size
+            param.grad.mul_(1.0 / factor)
             dist.all_reduce(param.grad)
         loss = model(inp).sum()
         loss.backward()
         ref_optim.step()
         optim.step()
+        self.assertEqual(ref_loss, loss)
+        # Check parity before calling zero_grad so that grads are also checked
+        check_sharded_parity(self, ref_model, model)
         ref_optim.zero_grad()
         optim.zero_grad()
-        self.assertEqual(ref_loss, loss)
-        check_sharded_parity(self, ref_model, model)
 
     def _test_set_reduce_scatter_divide_factor_mixed_prevision(
         self, divide_factor: float
@@ -484,7 +506,7 @@ class TestFullyShardCommunication(FSDPTest):
             fully_shard(mlp, mp_policy=mp_policy)
         model = fully_shard(model, mp_policy=mp_policy)
         optim = torch.optim.AdamW(model.parameters(), lr=1e-2)
-        model.set_reduce_scatter_divide_factor(divide_factor)
+        model.set_gradient_divide_factor(divide_factor)
 
         torch.manual_seed(42 + self.rank)
         inp = torch.randn((4, 16), device=device_type.type, dtype=param_dtype)
@@ -34,7 +34,11 @@ from torch.distributed.tensor._ops.utils import (
     register_op_strategy,
     replicate_op_strategy,
 )
-from torch.distributed.tensor.debug import CommDebugMode
+from torch.distributed.tensor.debug import (
+    _clear_fast_path_sharding_prop_cache,
+    _clear_python_sharding_prop_cache,
+    CommDebugMode,
+)
 from torch.testing._internal.common_utils import run_tests, TestCase
 from torch.testing._internal.distributed._tensor.common_dtensor import (
     create_local_tensor_test_class,
@@ -479,7 +483,8 @@ def op_strategy_context(op_overload, strategy_func, schema_info=None):
         del propagator.op_to_schema_info[op_overload]
     else:
         propagator.op_to_schema_info[op_overload] = _origin_op_strategy_schema
-    propagator.propagate_op_sharding.cache.cache_clear()
+    _clear_fast_path_sharding_prop_cache()
+    _clear_python_sharding_prop_cache()
 
 
 def detect_exists_identical_opspec(*args, op, mesh, strategy_function) -> bool:
@@ -645,6 +650,28 @@ class TestStrategyHashing(DTensorTestBase):
         self.assertEqual(out1.full_tensor(), out2.full_tensor())
 
 
+class TestStrategyOperation(DTensorTestBase):
+    @property
+    def world_size(self):
+        return 2
+
+    @with_comms
+    def test_cache_clean(self):
+        mesh = self.build_device_mesh()
+        test_op = torch.ops.mylib.numpy_sin
+        x = torch.randn(2, device=self.device_type)
+        y = torch.randn(2, device=self.device_type)
+        x_dt = distribute_tensor(x, mesh, [Shard(0)])
+        y_dt = distribute_tensor(y, mesh, [Shard(0)])
+        with op_strategy_context(test_op.default, replicate_op_strategy):
+            self._test_op_on_dtensor(test_op, x_dt, y_dt)
+        with self.assertRaisesRegex(
+            NotImplementedError,
+            f"Operator {test_op.default} does not have a sharding strategy registered",
+        ):
+            self._test_op_on_dtensor(test_op, x_dt, y_dt)
+
+
 DistTensorReplicateStrategyRegistrationTestWithLocalTensor = (
     create_local_tensor_test_class(
         DistTensorReplicateStrategyRegistrationTest,
@@ -6,8 +6,8 @@ import itertools
 import torch
 import torch.distributed._functional_collectives as funcol
 import torch.distributed.tensor._random as random
+from torch.distributed._local_tensor import LocalTensor, maybe_run_for_local_tensor
 from torch.distributed.device_mesh import init_device_mesh
-from torch.distributed.distributed_c10d import broadcast_object_list
 from torch.distributed.fsdp import fully_shard
 from torch.distributed.tensor import (
     DeviceMesh,
@@ -26,6 +26,7 @@ from torch.distributed.tensor.debug import CommDebugMode
 from torch.distributed.tensor.parallel import ColwiseParallel, parallelize_module
 from torch.testing._internal.common_utils import run_tests
 from torch.testing._internal.distributed._tensor.common_dtensor import (
+    create_local_tensor_test_class,
     DTensorTestBase,
     skip_if_lt_x_gpu,
     skip_unless_torch_gpu,
@@ -34,9 +35,12 @@ from torch.testing._internal.distributed._tensor.common_dtensor import (
 from torch.utils._typing_utils import not_none
 
 
-def get_generator_seed_for_device_type(device_type: str) -> int:
-    device_module = torch.get_device_module(device_type)
-    return device_module.get_rng_state()[:8].view(torch.int64).item()
+def get_generator_seed_for_device_type(device_type: str):
+    from torch.distributed._local_tensor import (
+        get_generator_seed_for_device_type as _get_seed,
+    )
+
+    return _get_seed(device_type)
 
 
 class DistTensorRandomInitTest(DTensorTestBase):
@@ -134,9 +138,6 @@ class DistTensorRandomInitTest(DTensorTestBase):
             torch.empty(*size, device="meta"), device_mesh, [Replicate()]
         )
 
-        # the tensor slice on the current rank
-        self_slice = slice(1024 * self.rank, 1024 * self.rank + 1024)
-
         # Test 1: enable the distribute region for RNG (by default)
         self.assertTrue(meta_dtensor.is_meta)
         # Tensor meta init
@@ -150,16 +151,23 @@ class DistTensorRandomInitTest(DTensorTestBase):
             dtensor.to_local(), gather_dim=0, group=(device_mesh, 0)
         )
 
-        # compare with local tensors from other ranks
-        for other_rank in range(self.world_size):
-            # the RNG result on each rank are the same because they're replicated
-            if self.rank != other_rank:
-                # other rank should have an identical local tensor
-                other_slice = slice(1024 * other_rank, 1024 * other_rank + 1024)
-                self.assertEqual(
-                    gathered_local_tensors[self_slice, :],
-                    gathered_local_tensors[other_slice, :],
-                )
+        @maybe_run_for_local_tensor
+        def compute_rankwise_if_local_tensor(gathered_local_tensors, rank):
+            # the tensor slice on the current rank
+            self_slice = slice(1024 * rank, 1024 * rank + 1024)
+
+            # compare with local tensors from other ranks
+            for other_rank in range(self.world_size):
+                # the RNG result on each rank are the same because they're replicated
+                if rank != other_rank:
+                    # other rank should have an identical local tensor
+                    other_slice = slice(1024 * other_rank, 1024 * other_rank + 1024)
+                    self.assertEqual(
+                        gathered_local_tensors[self_slice, :],
+                        gathered_local_tensors[other_slice, :],
+                    )
+
+        compute_rankwise_if_local_tensor(gathered_local_tensors.wait(), self.rank)
 
         # Test 2: disable the distribute region for RNG
         self.assertTrue(meta_dtensor.is_meta)
@@ -175,15 +183,7 @@ class DistTensorRandomInitTest(DTensorTestBase):
             dtensor.to_local(), gather_dim=0, group=(device_mesh, 0)
         )
 
-        # compare with local tensors from other ranks
-        for other_rank in range(self.world_size):
-            # the RNG result on each rank are the same even without the help of DTensor's RNG infra,
-            # since the default RNG is the same across ranks.
-            if self.rank != other_rank:
-                other_slice = slice(1024 * other_rank, 1024 * other_rank + 1024)
-                self.assertEqual(
-                    local_tensor[self_slice, :], local_tensor[other_slice, :]
-                )
+        compute_rankwise_if_local_tensor(local_tensor.wait(), self.rank)
 
     @with_comms
     @skip_unless_torch_gpu
@@ -224,13 +224,17 @@ class DistTensorRandomInitTest(DTensorTestBase):
             group=WORLD,
         )
 
-        # verify the weights are initialized differently on all ranks
-        for other_rank in range(self.world_size):
-            if self.rank != other_rank:
-                self.assertNotEqual(
-                    weight_local,
-                    weight_gather[other_rank : other_rank + 1, :],
-                )
+        @maybe_run_for_local_tensor
+        def compute_rankwise_if_local_tensor(weight_local, weight_gather, rank):
+            # verify the weights are initialized differently on all ranks
+            for other_rank in range(self.world_size):
+                if rank != other_rank:
+                    self.assertNotEqual(
+                        weight_local,
+                        weight_gather[other_rank : other_rank + 1, :],
+                    )
+
+        compute_rankwise_if_local_tensor(weight_local, weight_gather.wait(), self.rank)
 
     @with_comms
     @skip_if_lt_x_gpu(4)
@@ -277,13 +281,17 @@ class DistTensorRandomInitTest(DTensorTestBase):
             group=WORLD,
         )
 
-        # verify the weights are initialized differently on all ranks
-        for other_rank in range(self.world_size):
-            if self.rank != other_rank:
-                self.assertNotEqual(
-                    weight_local,
-                    weight_gather[other_rank : other_rank + 1, :],
-                )
+        @maybe_run_for_local_tensor
+        def compute_rankwise_if_local_tensor(weight_local, weight_gather, rank):
+            # verify the weights are initialized differently on all ranks
+            for other_rank in range(self.world_size):
+                if rank != other_rank:
+                    self.assertNotEqual(
+                        weight_local,
+                        weight_gather[other_rank : other_rank + 1, :],
+                    )
+
+        compute_rankwise_if_local_tensor(weight_local, weight_gather.wait(), self.rank)
 
 
 class DistTensorRandomOpTest(DTensorTestBase):
@@ -291,9 +299,14 @@ class DistTensorRandomOpTest(DTensorTestBase):
     @skip_unless_torch_gpu
     def test_rng_tracker_init(self):
         torch.manual_seed(self.rank)
-        object_list = [torch.initial_seed()]
-        broadcast_object_list(object_list)
-        seed_from_rank_0 = int(object_list[0])
+        seed_local = (
+            torch.zeros_like(torch.empty(1), device=self.device_type)
+            + torch.initial_seed()
+        )
+        torch.distributed.broadcast(seed_local, src=0)
+        # if localtensor, it should automaticall reconcile after the broadcast
+        # since all virtual ranks should have rank 0's initial_seed()
+        seed_from_rank_0 = seed_local
 
         device_mesh = DeviceMesh(self.device_type, torch.arange(self.world_size))
         # seed synchronization now does NOT happen after the first `distribute_tensor`
@@ -344,15 +357,19 @@ class DistTensorRandomOpTest(DTensorTestBase):
     @with_comms
     @skip_unless_torch_gpu
     def test_manual_seed_submesh(self):
-        # the current rank is not a part of the mesh
-        single_rank_device_mesh = DeviceMesh(
-            self.device_type, [(self.rank + 1) % self.world_size]
-        )
-        with self.assertRaisesRegex(
-            RuntimeError,
-            "manual_seed requires the current rank to be a part of the device mesh",
+        @maybe_run_for_local_tensor
+        def compute_rankwise_if_local_tensor(rank):
+            # the current rank is not a part of the mesh
+            single_rank_device_mesh = DeviceMesh(
+                self.device_type, [(rank + 1) % self.world_size], _rank=rank
+            )
|
with self.assertRaisesRegex(
|
||||||
):
|
RuntimeError,
|
||||||
manual_seed(self.rank, single_rank_device_mesh)
|
"manual_seed requires the current rank to be a part of the device mesh",
|
||||||
|
):
|
||||||
|
manual_seed(rank, single_rank_device_mesh)
|
||||||
|
|
||||||
|
compute_rankwise_if_local_tensor(self.rank)
|
||||||
|
|
||||||
@with_comms
|
@with_comms
|
||||||
@skip_unless_torch_gpu
|
@skip_unless_torch_gpu
|
||||||
@ -394,7 +411,7 @@ class DistTensorRandomOpTest(DTensorTestBase):
|
|||||||
for other_rank in range(self.world_size):
|
for other_rank in range(self.world_size):
|
||||||
if self.rank != other_rank:
|
if self.rank != other_rank:
|
||||||
self.assertNotEqual(
|
self.assertNotEqual(
|
||||||
spmd_dtensor.to_local(),
|
spmd_dtensor,
|
||||||
tensor_gather[2 * other_rank : 2 * (other_rank + 1), :],
|
tensor_gather[2 * other_rank : 2 * (other_rank + 1), :],
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -428,16 +445,20 @@ class DistTensorRandomOpTest(DTensorTestBase):
|
|||||||
dtensor.to_local(), gather_dim=0, group=(device_mesh, 0)
|
dtensor.to_local(), gather_dim=0, group=(device_mesh, 0)
|
||||||
)
|
)
|
||||||
|
|
||||||
# compare with local tensors from other ranks
|
@maybe_run_for_local_tensor
|
||||||
self_slice = slice(4 * self.rank, 4 * self.rank + 4)
|
def compute_rankwise_if_local_tensor(local_tensor, rank):
|
||||||
for other_rank in range(self.world_size):
|
# compare with local tensors from other ranks
|
||||||
if self.rank != other_rank:
|
self_slice = slice(4 * rank, 4 * rank + 4)
|
||||||
# other rank should have an identical local tensor
|
for other_rank in range(self.world_size):
|
||||||
other_slice = slice(4 * other_rank, 4 * other_rank + 4)
|
if rank != other_rank:
|
||||||
self.assertEqual(
|
# other rank should have an identical local tensor
|
||||||
local_tensor[self_slice, :],
|
other_slice = slice(4 * other_rank, 4 * other_rank + 4)
|
||||||
local_tensor[other_slice, :],
|
self.assertEqual(
|
||||||
)
|
local_tensor[self_slice, :],
|
||||||
|
local_tensor[other_slice, :],
|
||||||
|
)
|
||||||
|
|
||||||
|
compute_rankwise_if_local_tensor(local_tensor, self.rank)
|
||||||
|
|
||||||
@with_comms
|
@with_comms
|
||||||
@skip_unless_torch_gpu
|
@skip_unless_torch_gpu
|
||||||
@ -454,16 +475,20 @@ class DistTensorRandomOpTest(DTensorTestBase):
|
|||||||
dtensor.to_local(), gather_dim=0, group=(device_mesh, 0)
|
dtensor.to_local(), gather_dim=0, group=(device_mesh, 0)
|
||||||
)
|
)
|
||||||
|
|
||||||
# compare with local tensors from other ranks
|
@maybe_run_for_local_tensor
|
||||||
self_slice = slice(4 * self.rank, 4 * self.rank + 4)
|
def compute_rankwise_if_local_tensor(local_tensor, rank):
|
||||||
for other_rank in range(self.world_size):
|
# compare with local tensors from other ranks
|
||||||
if self.rank != other_rank:
|
self_slice = slice(4 * rank, 4 * rank + 4)
|
||||||
# other rank should have a different local tensor for shard placement
|
for other_rank in range(self.world_size):
|
||||||
other_slice = slice(4 * other_rank, 4 * other_rank + 4)
|
if rank != other_rank:
|
||||||
self.assertNotEqual(
|
# other rank should have an identical local tensor for replicate placement
|
||||||
local_tensor[self_slice, :],
|
other_slice = slice(4 * other_rank, 4 * other_rank + 4)
|
||||||
local_tensor[other_slice, :],
|
self.assertNotEqual(
|
||||||
)
|
local_tensor[self_slice, :],
|
||||||
|
local_tensor[other_slice, :],
|
||||||
|
)
|
||||||
|
|
||||||
|
compute_rankwise_if_local_tensor(local_tensor, self.rank)
|
||||||
|
|
||||||
# we should set manual seed to the same value on all SPMD ranks
|
# we should set manual seed to the same value on all SPMD ranks
|
||||||
torch.manual_seed(0)
|
torch.manual_seed(0)
|
||||||
@ -472,16 +497,20 @@ class DistTensorRandomOpTest(DTensorTestBase):
|
|||||||
dtensor.to_local(), gather_dim=0, group=(device_mesh, 0)
|
dtensor.to_local(), gather_dim=0, group=(device_mesh, 0)
|
||||||
)
|
)
|
||||||
|
|
||||||
# compare with local tensors from other ranks
|
@maybe_run_for_local_tensor
|
||||||
self_slice = slice(4 * self.rank, 4 * self.rank + 4)
|
def compute_rankwise_if_local_tensor(local_tensor, rank):
|
||||||
for other_rank in range(self.world_size):
|
# compare with local tensors from other ranks
|
||||||
if self.rank != other_rank:
|
self_slice = slice(4 * rank, 4 * rank + 4)
|
||||||
# other rank should have an identical local tensor for replicate placement
|
for other_rank in range(self.world_size):
|
||||||
other_slice = slice(4 * other_rank, 4 * other_rank + 4)
|
if rank != other_rank:
|
||||||
self.assertEqual(
|
# other rank should have an identical local tensor for replicate placement
|
||||||
local_tensor[self_slice, :],
|
other_slice = slice(4 * other_rank, 4 * other_rank + 4)
|
||||||
local_tensor[other_slice, :],
|
self.assertEqual(
|
||||||
)
|
local_tensor[self_slice, :],
|
||||||
|
local_tensor[other_slice, :],
|
||||||
|
)
|
||||||
|
|
||||||
|
compute_rankwise_if_local_tensor(local_tensor, self.rank)
|
||||||
|
|
||||||
@with_comms
|
@with_comms
|
||||||
@skip_if_lt_x_gpu(4)
|
@skip_if_lt_x_gpu(4)
|
||||||
@ -539,7 +568,12 @@ class DistTensorRandomOpTest(DTensorTestBase):
|
|||||||
shard_linear_idx = random._rng_tracker._calc_shard_linear_idx(
|
shard_linear_idx = random._rng_tracker._calc_shard_linear_idx(
|
||||||
shard_coord, shard_size
|
shard_coord, shard_size
|
||||||
)
|
)
|
||||||
self.assertEqual(shard_linear_idx, shard_index[self.rank])
|
|
||||||
|
@maybe_run_for_local_tensor
|
||||||
|
def check_shard_index(shard_linear_idx, rank):
|
||||||
|
self.assertEqual(shard_linear_idx, shard_index[rank])
|
||||||
|
|
||||||
|
check_shard_index(shard_linear_idx, self.rank)
|
||||||
|
|
||||||
# compute local size and offset
|
# compute local size and offset
|
||||||
_, local_shard_offset = compute_local_shape_and_global_offset(
|
_, local_shard_offset = compute_local_shape_and_global_offset(
|
||||||
@ -578,16 +612,27 @@ class DistTensorRandomOpTest(DTensorTestBase):
|
|||||||
# allgather the local tensors
|
# allgather the local tensors
|
||||||
full_tensor = dtensor.full_tensor()
|
full_tensor = dtensor.full_tensor()
|
||||||
|
|
||||||
# compare local tensor with each other shard
|
full_tensor = (
|
||||||
for other_local_shard in local_shard_comb:
|
full_tensor.reconcile()
|
||||||
other_local_shard_offset, _ = zip(*other_local_shard)
|
if isinstance(full_tensor, LocalTensor)
|
||||||
slice_idx = [
|
else full_tensor
|
||||||
slice(offset, offset + size) for offset, size in other_local_shard
|
)
|
||||||
]
|
|
||||||
if local_shard_offset == other_local_shard_offset:
|
@maybe_run_for_local_tensor
|
||||||
self.assertEqual(full_tensor[tuple(slice_idx)], local_tensor)
|
def blockwise_iter_if_localtensor(local_tensor, local_shard_offset):
|
||||||
else:
|
# compare local tensor with each other shard
|
||||||
self.assertNotEqual(full_tensor[tuple(slice_idx)], local_tensor)
|
for other_local_shard in local_shard_comb:
|
||||||
|
other_local_shard_offset, _ = zip(*other_local_shard)
|
||||||
|
slice_idx = [
|
||||||
|
slice(offset, offset + size)
|
||||||
|
for offset, size in other_local_shard
|
||||||
|
]
|
||||||
|
if local_shard_offset == other_local_shard_offset:
|
||||||
|
self.assertEqual(full_tensor[tuple(slice_idx)], local_tensor)
|
||||||
|
else:
|
||||||
|
self.assertNotEqual(full_tensor[tuple(slice_idx)], local_tensor)
|
||||||
|
|
||||||
|
blockwise_iter_if_localtensor(local_tensor, local_shard_offset)
|
||||||
|
|
||||||
|
|
||||||
class DistTensorRandomOpsTest3D(DTensorTestBase):
|
class DistTensorRandomOpsTest3D(DTensorTestBase):
|
||||||
@ -641,22 +686,46 @@ class DistTensorRandomOpsTest3D(DTensorTestBase):
|
|||||||
group=WORLD,
|
group=WORLD,
|
||||||
)
|
)
|
||||||
|
|
||||||
# verify the weights are initialized differently on all ranks
|
weight_gather = weight_gather.wait()
|
||||||
shard_dim_0_len = self.world_size // 4
|
|
||||||
for other_rank in range(self.world_size):
|
|
||||||
other_rank_dim_0_start = other_rank * shard_dim_0_len
|
|
||||||
other_rank_dim_0_end = other_rank_dim_0_start + shard_dim_0_len
|
|
||||||
if self.rank % 4 != other_rank % 4:
|
|
||||||
self.assertNotEqual(
|
|
||||||
weight_local,
|
|
||||||
weight_gather[other_rank_dim_0_start:other_rank_dim_0_end, :],
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
self.assertEqual(
|
|
||||||
weight_local,
|
|
||||||
weight_gather[other_rank_dim_0_start:other_rank_dim_0_end, :],
|
|
||||||
)
|
|
||||||
|
|
||||||
|
weight_gather = (
|
||||||
|
weight_gather.reconcile()
|
||||||
|
if isinstance(weight_gather, LocalTensor)
|
||||||
|
else weight_gather
|
||||||
|
)
|
||||||
|
|
||||||
|
@maybe_run_for_local_tensor
|
||||||
|
def compute_rankwise_if_local_tensor(weight_local, rank):
|
||||||
|
# verify the weights are initialized differently on all ranks
|
||||||
|
shard_dim_0_len = self.world_size // 4
|
||||||
|
for other_rank in range(self.world_size):
|
||||||
|
other_rank_dim_0_start = other_rank * shard_dim_0_len
|
||||||
|
other_rank_dim_0_end = other_rank_dim_0_start + shard_dim_0_len
|
||||||
|
if rank % 4 != other_rank % 4:
|
||||||
|
self.assertNotEqual(
|
||||||
|
weight_local,
|
||||||
|
weight_gather[other_rank_dim_0_start:other_rank_dim_0_end, :],
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
self.assertEqual(
|
||||||
|
weight_local,
|
||||||
|
weight_gather[other_rank_dim_0_start:other_rank_dim_0_end, :],
|
||||||
|
)
|
||||||
|
|
||||||
|
compute_rankwise_if_local_tensor(weight_local, self.rank)
|
||||||
|
|
||||||
|
|
||||||
|
DistTensorRandomInitTestWithLocalTensor = create_local_tensor_test_class(
|
||||||
|
DistTensorRandomInitTest,
|
||||||
|
)
|
||||||
|
|
||||||
|
DistTensorRandomOpTestWithLocalTensor = create_local_tensor_test_class(
|
||||||
|
DistTensorRandomOpTest,
|
||||||
|
)
|
||||||
|
|
||||||
|
DistTensorRandomOpsTest3DWithLocalTensor = create_local_tensor_test_class(
|
||||||
|
DistTensorRandomOpsTest3D,
|
||||||
|
)
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
run_tests()
|
run_tests()
|
||||||
|
|||||||
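Note: the hunks above repeatedly wrap per-rank assertions in an inner function decorated with @maybe_run_for_local_tensor and then call it with self.rank. A minimal sketch of that pattern, assuming the decorator (from torch.distributed._local_tensor) re-invokes the wrapped function once per virtual rank when its arguments are LocalTensors and runs it once unchanged otherwise:

    # Sketch only; `maybe_run_for_local_tensor` is the real helper these tests
    # use, but the check below is an illustrative stand-in, not the test code.
    from torch.distributed._local_tensor import maybe_run_for_local_tensor

    def check_replicated(gathered, world_size, rank_hint, chunk=1024):
        @maybe_run_for_local_tensor
        def per_rank(gathered, rank):
            self_slice = slice(chunk * rank, chunk * rank + chunk)
            for other in range(world_size):
                if rank != other:
                    other_slice = slice(chunk * other, chunk * other + chunk)
                    # replicated RNG output: every rank's slice must match
                    assert (gathered[self_slice] == gathered[other_slice]).all()

        per_rank(gathered, rank_hint)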
@ -12,7 +12,6 @@ import torch.distributed._symmetric_memory as symm_mem
 import torch.distributed._symmetric_memory._nvshmem_triton as nvshmem
 from torch._inductor.runtime.triton_compat import triton
 from torch.distributed._symmetric_memory._nvshmem_triton import requires_nvshmem
-from torch.testing._internal.common_cuda import SM100OrLater
 from torch.testing._internal.common_distributed import MultiProcContinuousTest
 from torch.testing._internal.common_utils import (
     instantiate_parametrized_tests,

@ -265,10 +264,6 @@ def my_reduce_kernel(
     nvshmem.reduce(team_handle, dest_tensor, source_tensor, nreduce, operation)


-@skip_but_pass_in_sandcastle_if(
-    SM100OrLater,
-    "Skipping all NVSHMEM Triton tests due to https://github.com/pytorch/pytorch/issues/162897",
-)
 @instantiate_parametrized_tests
 class NVSHMEMTritonTest(MultiProcContinuousTest):
     def _init_device(self) -> None:
@ -585,6 +585,10 @@ class GraphModule(torch.nn.Module):
         # Annotation: {'stream': 1}
         mul_3: "f32[2, 2]" = torch.ops.aten.mul.Tensor(tangents_1, 2);  tangents_1 = None

+        # No stacktrace found for following nodes
+        record_event_default = torch.ops.streams.record_event.default(2, 1);  record_event_default = None
+        wait_event_default = torch.ops.streams.wait_event.default(2, 0);  wait_event_default = None
+
         # Annotation: {'stream': 0}
         add_3: "f32[2, 2]" = torch.ops.aten.add.Tensor(mul_2, mul_3);  mul_2 = mul_3 = None
         return (add_3, add_2)
@ -1405,7 +1405,7 @@ class TestConverter(TestCase):
         )

     # qnnpack not supported on s390x
     @xfailIfS390X
-    def test_ts2ep_convert_quantized_model(self):
+    def test_ts2ep_convert_quantized_model1(self):
         class Standalone(torch.nn.Module):
             def __init__(self):
                 super().__init__()
@ -640,16 +640,13 @@ class TestPasses(TestCase):
         self.assertExpectedInline(
             without_token_ep.graph_module.code.strip(),
             """\
-def forward(self, token, obj_attr, x):
-    with_effects = torch.ops.higher_order.with_effects(token, torch.ops._TorchScriptTesting.takes_foo_tuple_return.default, foo = obj_attr, x = x);  token = x = None
-    getitem = with_effects[0]
-    getitem_1 = with_effects[1]
-    getitem_2 = with_effects[2];  with_effects = None
+def forward(self, obj_attr, x):
+    takes_foo_tuple_return_default = torch.ops._TorchScriptTesting.takes_foo_tuple_return.default(foo = obj_attr, x = x);  x = None
+    getitem_1 = takes_foo_tuple_return_default[0]
+    getitem_2 = takes_foo_tuple_return_default[1];  takes_foo_tuple_return_default = None
     add = torch.ops.aten.add.Tensor(getitem_1, getitem_2);  getitem_1 = getitem_2 = None
-    with_effects_1 = torch.ops.higher_order.with_effects(getitem, torch.ops._TorchScriptTesting.takes_foo.default, foo = obj_attr, x = add);  getitem = obj_attr = add = None
-    getitem_3 = with_effects_1[0]
-    getitem_4 = with_effects_1[1];  with_effects_1 = None
-    return (getitem_3, getitem_4)""",  # noqa: B950
+    takes_foo_default = torch.ops._TorchScriptTesting.takes_foo.default(foo = obj_attr, x = add);  obj_attr = add = None
+    return (takes_foo_default,)""",  # noqa: B950
         )

     def test_fakify_script_objects(self):
@ -461,9 +461,9 @@ def forward(self, x):
     x, = fx_pytree.tree_flatten_spec(([x], {}), self._in_spec)
     attr = self.attr
     _guards_fn = self._guards_fn(x);  _guards_fn = None
-    takes_foo_default_1 = torch.ops._TorchScriptTesting.takes_foo.default(attr, x)
-    takes_foo_default = torch.ops._TorchScriptTesting.takes_foo.default(attr, takes_foo_default_1);  attr = takes_foo_default_1 = None
-    add = torch.ops.aten.add.Tensor(x, takes_foo_default);  x = takes_foo_default = None
+    takes_foo_default = torch.ops._TorchScriptTesting.takes_foo.default(attr, x)
+    takes_foo_default_1 = torch.ops._TorchScriptTesting.takes_foo.default(attr, takes_foo_default);  attr = takes_foo_default = None
+    add = torch.ops.aten.add.Tensor(x, takes_foo_default_1);  x = takes_foo_default_1 = None
     return pytree.tree_unflatten((add,), self._out_spec)""",  # noqa: B950
         )
         self.assertExpectedInline(
@ -1087,10 +1087,12 @@ def forward(self, token, tq, x):
             str(ep.graph_module.graph).strip(),
             """\
 graph():
+    %token : [num_users=1] = placeholder[target=token]
     %tq : [num_users=2] = placeholder[target=tq]
     %x : [num_users=1] = placeholder[target=x]
-    %queue_push_default : [num_users=0] = call_function[target=torch.ops._TorchScriptTesting.queue_push.default](args = (%tq, %x), kwargs = {})
-    return (tq,)""",  # noqa: B950
+    %with_effects : [num_users=1] = call_function[target=torch.ops.higher_order.with_effects](args = (%token, _TorchScriptTesting.queue_push.default, %tq, %x), kwargs = {})
+    %getitem : [num_users=1] = call_function[target=operator.getitem](args = (%with_effects, 0), kwargs = {})
+    return (getitem, tq)""",  # noqa: B950
         )

     def test_deepcopy(self):
@ -870,6 +870,100 @@ def forward(self, primals_2, getitem_1, tangents_1, tangents_token):
         finally:
             handle.destroy()

+    @unittest.skipIf(not TEST_CUDA, "triton")
+    def test_export_invoke_subgraph(self):
+        with torch.library._scoped_library("mylib", "FRAGMENT") as lib:
+            recorded_list = []
+
+            @torch.library.custom_op("mylib::record_memory", mutates_args=())
+            def record_memory(prefix: str, module_name: str) -> None:
+                torch.cuda.synchronize()
+                mem_alloc = torch.cuda.memory_allocated() / 1024**2
+                mem_reserved = torch.cuda.memory_reserved() / 1024**2
+                memory_str = f"[{prefix}] {module_name}: allocated={mem_alloc:.2f} MB, reserved={mem_reserved:.2f} MB"
+                recorded_list.append(memory_str)
+
+            @record_memory.register_fake
+            def record_memory_fake(prefix, module_name):
+                return
+
+            record_memory.register_effect(_EffectType.ORDERED)
+
+            class N(torch.nn.Module):
+                def __init__(self):
+                    super().__init__()
+                    self.linear1 = torch.nn.Linear(1024, 1024)
+                    self.relu = torch.nn.ReLU()
+                    self.linear2 = torch.nn.Linear(1024, 1024)
+
+                @torch.compiler.nested_compile_region
+                def forward(self, x):
+                    torch.ops.mylib.record_memory("forward", "N")
+                    x = self.linear1(x)
+                    x = self.relu(x)
+                    x = self.linear2(x)
+                    return x
+
+            class M(torch.nn.Module):
+                def __init__(self):
+                    super().__init__()
+                    self.mod_list = torch.nn.ModuleList(N() for _ in range(3))
+
+                def forward(self, x):
+                    for m in self.mod_list:
+                        x = m(x)
+                    torch.ops.mylib.record_memory("forward", "N")
+                    return (x,)
+
+            model = M().to("cuda")
+            torch.cuda.reset_peak_memory_stats()
+
+            x = torch.randn(32, 1024, requires_grad=True, device="cuda")
+
+            ep = torch.export.export(model, (x,))
+            ep = ep.run_decompositions()
+            self.assertEqual(len(list(ep.graph_module.named_modules())), 2)
+
+            self.assertExpectedInline(
+                ep.graph_module.code.strip(),
+                """\
+def forward(self, token, p_mod_list_0_linear1_weight, p_mod_list_0_linear1_bias, p_mod_list_0_linear2_weight, p_mod_list_0_linear2_bias, p_mod_list_1_linear1_weight, p_mod_list_1_linear1_bias, p_mod_list_1_linear2_weight, p_mod_list_1_linear2_bias, p_mod_list_2_linear1_weight, p_mod_list_2_linear1_bias, p_mod_list_2_linear2_weight, p_mod_list_2_linear2_bias, x):
+    repeated_subgraph0 = self.repeated_subgraph0
+    invoke_subgraph = torch.ops.higher_order.invoke_subgraph(repeated_subgraph0, 'subgraph_0', token, x, p_mod_list_0_linear1_weight, p_mod_list_0_linear1_bias, p_mod_list_0_linear2_weight, p_mod_list_0_linear2_bias);  repeated_subgraph0 = token = x = p_mod_list_0_linear1_weight = p_mod_list_0_linear1_bias = p_mod_list_0_linear2_weight = p_mod_list_0_linear2_bias = None
+    getitem = invoke_subgraph[0]
+    getitem_1 = invoke_subgraph[1];  invoke_subgraph = None
+    repeated_subgraph0_1 = self.repeated_subgraph0
+    invoke_subgraph_1 = torch.ops.higher_order.invoke_subgraph(repeated_subgraph0_1, 'subgraph_0', getitem, getitem_1, p_mod_list_1_linear1_weight, p_mod_list_1_linear1_bias, p_mod_list_1_linear2_weight, p_mod_list_1_linear2_bias);  repeated_subgraph0_1 = getitem = getitem_1 = p_mod_list_1_linear1_weight = p_mod_list_1_linear1_bias = p_mod_list_1_linear2_weight = p_mod_list_1_linear2_bias = None
+    getitem_2 = invoke_subgraph_1[0]
+    getitem_3 = invoke_subgraph_1[1];  invoke_subgraph_1 = None
+    repeated_subgraph0_2 = self.repeated_subgraph0
+    invoke_subgraph_2 = torch.ops.higher_order.invoke_subgraph(repeated_subgraph0_2, 'subgraph_0', getitem_2, getitem_3, p_mod_list_2_linear1_weight, p_mod_list_2_linear1_bias, p_mod_list_2_linear2_weight, p_mod_list_2_linear2_bias);  repeated_subgraph0_2 = getitem_2 = getitem_3 = p_mod_list_2_linear1_weight = p_mod_list_2_linear1_bias = p_mod_list_2_linear2_weight = p_mod_list_2_linear2_bias = None
+    getitem_4 = invoke_subgraph_2[0]
+    getitem_5 = invoke_subgraph_2[1];  invoke_subgraph_2 = None
+    with_effects = torch.ops.higher_order.with_effects(getitem_4, torch.ops.mylib.record_memory.default, 'forward', 'N');  getitem_4 = None
+    getitem_6 = with_effects[0];  with_effects = None
+    return (getitem_6, getitem_5)""",
+            )
+
+            self.assertExpectedInline(
+                ep.graph_module.repeated_subgraph0.code.strip(),
+                """\
+def forward(self, arg0_1, arg1_1, arg2_1, arg3_1, arg4_1, arg5_1):
+    with_effects = torch.ops.higher_order.with_effects(arg0_1, torch.ops.mylib.record_memory.default, 'forward', 'N');  arg0_1 = None
+    getitem = with_effects[0];  with_effects = None
+    permute = torch.ops.aten.permute.default(arg2_1, [1, 0]);  arg2_1 = None
+    addmm = torch.ops.aten.addmm.default(arg3_1, arg1_1, permute);  arg3_1 = arg1_1 = permute = None
+    relu = torch.ops.aten.relu.default(addmm);  addmm = None
+    permute_1 = torch.ops.aten.permute.default(arg4_1, [1, 0]);  arg4_1 = None
+    addmm_1 = torch.ops.aten.addmm.default(arg5_1, relu, permute_1);  arg5_1 = relu = permute_1 = None
+    return (getitem, addmm_1)""",
+            )
+
+            recorded_list.clear()
+            out2 = ep.module()(x)
+            self.assertEqual(len(recorded_list), 4)
+            self.assertTrue(torch.allclose(model(x)[0], out2[0]))


 if __name__ == "__main__":
     run_tests()
@ -7437,6 +7437,50 @@ class AOTInductorTestsTemplate:
             "RAIIAtenTensorHandle buf0(buf0_handle_restrided);"
         ).run(code)

+    def test_codegen_int_array_var_fix_memory_leak(self):
+        """
+        Fix https://github.com/pytorch/pytorch/issues/167630
+        """
+        if self.device != "cuda":
+            raise unittest.SkipTest("test is only for cuda")
+
+        def make_mlp(in_dim=128, hidden=256, out_dim=64, depth=3):
+            layers = []
+            d = in_dim
+            for _ in range(depth):
+                layers += [nn.Linear(d, hidden), nn.ReLU()]
+                d = hidden
+            layers += [nn.Linear(d, out_dim)]
+            return nn.Sequential(*layers)
+
+        batch = 32
+        in_dim = 2048
+        hidden = 512
+        out_dim = 10
+        depth = 6
+
+        import gc
+
+        allocated_memory = []
+        for _ in range(3):
+            torch.cuda.reset_peak_memory_stats()
+
+            model = make_mlp(in_dim, hidden, out_dim, depth).to(self.device)
+            example_inputs = (torch.randn(batch, in_dim, device=self.device),)
+            ep = torch.export.export(
+                model,
+                example_inputs,
+            )
+            torch._inductor.aoti_compile_and_package(ep)
+
+            del model, example_inputs, ep
+            torch.cuda.synchronize()
+            torch.cuda.empty_cache()
+            gc.collect()
+            allocated_memory.append(torch.cuda.memory_allocated())
+
+        self.assertTrue(allocated_memory[1] == allocated_memory[2])
+
     @unittest.skipIf(IS_MACOS, "might have no readelf on Mac")
     def test_libtorch_free_so(self):
         class Model(torch.nn.Module):
@ -828,9 +828,6 @@ inductor_one_sample["cuda"] = {
     "nn.functional.fractional_max_pool3d": {f16, f32, f64},
     "nn.functional.group_norm": {f16},
     "nn.functional.hinge_embedding_loss": {f16},
-    # Enabling all tests for this test fails randomly
-    # See https://github.com/pytorch/pytorch/issues/129238
-    "nn.functional.huber_loss": {f16},
     "nn.functional.interpolate.bicubic": {f16},
     "nn.functional.interpolate.bilinear": {f16},
     "nn.functional.interpolate.trilinear": {f16},

@ -948,9 +945,6 @@ inductor_one_sample["xpu"] = {
     "nn.functional.fractional_max_pool3d": {f16, f32, f64},
     "nn.functional.group_norm": {f16},
     "nn.functional.hinge_embedding_loss": {f16},
-    # Enabling all tests for this test fails randomly
-    # See https://github.com/pytorch/pytorch/issues/129238
-    "nn.functional.huber_loss": {f16},
     "nn.functional.interpolate.bicubic": {f16},
     "nn.functional.interpolate.bilinear": {f16},
     "nn.functional.interpolate.trilinear": {f16},
@ -357,6 +357,9 @@ class TestFFT(TestCase):
     @unittest.skipIf(not TEST_NUMPY, 'NumPy not found')
     @ops([op for op in spectral_funcs if op.ndimensional == SpectralFuncType.ND],
          allowed_dtypes=(torch.cfloat, torch.cdouble))
+    @toleranceOverride({
+        torch.cfloat : tol(2e-4, 1.3e-6),
+    })
     def test_reference_nd(self, device, dtype, op):
         if op.ref is None:
             raise unittest.SkipTest("No reference implementation")
Submodule third_party/fbgemm updated: c0b988d39a...643894e701
@ -421,7 +421,7 @@ RESET_GRAD_ACCUMULATOR = {"set_", "resize_"}
 # inplace or out-variants)
 # If the function does not modify its arguments, we also check the following properties
 # pertaining to its output:
-# 2) Its TensorImpl has use_count of 1
+# 2) Its TensorImpl has use_count of 1 (or 2 if it has a PyObject)
 # 3) If the function is a view function, it has the same StorageImpl as that of
 # the input it is aliased with. Otherwise, its StorageImpl has use_count of 1
 #

@ -496,10 +496,10 @@ if (${tensor_name}_impl_saved && !at::impl::dispatch_mode_enabled() && !at::impl
 """
 )

-ENFORCE_TENSOR_IMPL_USE_COUNT_LT_OR_EQ_ONE = CodeTemplate(
+ENFORCE_TENSOR_IMPL_USE_COUNT = CodeTemplate(
     """\
 if (!at::impl::dispatch_mode_enabled() && !at::impl::tensor_has_dispatch(${tensor_name}))
-  TORCH_INTERNAL_ASSERT(${tensor_name}.use_count() <= 1, "function: ${fn_name}");
+  TORCH_INTERNAL_ASSERT(${tensor_name}.use_count() == expected_fresh_use_count(${tensor_name}), "function: ${fn_name}");
 """
 )

@ -1664,7 +1664,7 @@ def emit_body(

     if type_wrapper_name(f) not in DONT_ENFORCE_TENSOR_IMPL_USE_COUNT:
         stmts_after_call += [
-            ENFORCE_TENSOR_IMPL_USE_COUNT_LT_OR_EQ_ONE.substitute(
+            ENFORCE_TENSOR_IMPL_USE_COUNT.substitute(
                 tensor_name=ret_name, fn_name=type_wrapper_name(f)
             )
         ]
|
|||||||
meta->grad_accumulator_.reset();
|
meta->grad_accumulator_.reset();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
[[maybe_unused]] size_t expected_fresh_use_count(const Variable& self) {
|
||||||
|
if (!self.defined()) {
|
||||||
|
// An UndefinedTensorImpl always has a use count of 0
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
if (self.unsafeGetTensorImpl()->pyobj_slot()->load_pyobj() != nullptr) {
|
||||||
|
// A TensorImpl with a Python object has a use count of 2
|
||||||
|
return 2;
|
||||||
|
}
|
||||||
|
// A fresh TensorImpl (with no PyObject) has a use count of 1
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
namespace {
|
namespace {
|
||||||
|
|||||||
@ -33,7 +33,7 @@ from .graph_capture_wrappers import (
     handle_effect_tokens_fn,
 )
 from .schemas import AOTConfig, FxValue, SubclassMeta, TraceFn, ViewAndMutationMeta
-from .streams import assign_backward_streams
+from .streams import assign_backward_streams, insert_backward_syncs
 from .utils import (
     call_and_expect_output_descs,
     copy_fwd_metadata_to_bw_nodes,

@ -477,6 +477,8 @@ def aot_dispatch_autograd_graph(
     # After copying metadata, assign streams to gradient accumulation nodes
     assign_backward_streams(fx_g)

+    insert_backward_syncs(fx_g)
+
     fx_g.graph.eliminate_dead_code()
     if not aot_config.disable_functionalization:
         # There should be *NO* mutating ops in the graph at this point.
@ -3,6 +3,7 @@ from typing import Optional, TypeAlias
 import torch.fx
 import torch.fx.traceback
 from torch._dynamo.graph_utils import _get_flat_args
+from torch._dynamo.variables.streams import get_current_stream, new_event


 Node: TypeAlias = torch.fx.Node

@ -12,6 +13,14 @@ def is_gradient_acc(node: Node) -> bool:
     return node.meta.get("is_gradient_acc", False)


+def is_bwd_node(node: Node) -> bool:
+    return node.meta.get("partitioner_tag") == "is_backward"
+
+
+def get_device(node: Node) -> torch.device:
+    return node.meta["val"].device
+
+
 def get_stream(node: Node) -> Optional[int]:
     maybe_annotation = node.meta.get("custom", None)
     if maybe_annotation is not None:

@ -20,6 +29,13 @@ def get_stream(node: Node) -> Optional[int]:
     return None


+def get_stream_or_current_stream(node: Node) -> int:
+    ind = get_stream(node)
+    if ind is None:
+        ind = get_current_stream(get_device(node))
+    return ind
+
+
 def set_stream(node: Node, ind: int) -> None:
     if "custom" in node.meta:
         node.meta["custom"].update({"stream": ind})

@ -27,6 +43,36 @@ def set_stream(node: Node, ind: int) -> None:
         node.meta["custom"] = {"stream": ind}


+def insert_sync(
+    graph: torch.fx.Graph,
+    consumer: Node,
+    producer: Node,
+    node_to_wait_event_ind: dict[Node, int],
+) -> None:
+    if producer not in node_to_wait_event_ind:
+        node_to_wait_event_ind[producer] = new_event()
+
+    with graph.inserting_after(producer):
+        node = graph.call_function(
+            torch.ops.streams.record_event.default,
+            (
+                node_to_wait_event_ind[producer],
+                get_stream_or_current_stream(producer),
+            ),
+        )
+        node.meta["partitioner_tag"] = "must_be_in_backward"
+
+    with graph.inserting_before(consumer):
+        node = graph.call_function(
+            torch.ops.streams.wait_event.default,
+            (
+                node_to_wait_event_ind[producer],
+                get_stream_or_current_stream(consumer),
+            ),
+        )
+        node.meta["partitioner_tag"] = "must_be_in_backward"
+
+
 def assign_backward_streams(gm: torch.fx.GraphModule) -> None:
     """Assigns backward streams to gradient accumulation nodes"""

@ -51,3 +97,18 @@ def assign_backward_streams(gm: torch.fx.GraphModule) -> None:
                 if ind is not None:
                     set_stream(node, ind)
                     break
+
+
+def insert_backward_syncs(gm: torch.fx.GraphModule) -> None:
+    """Inserts stream syncs for backward nodes if consumer and producer are on different streams"""
+    node_to_wait_event_ind = {}
+    for node in gm.graph.nodes:
+        if is_bwd_node(node):
+            flat_args = _get_flat_args(node, {})
+            cur_node_stream = get_stream(node)
+
+            for arg in flat_args:
+                if is_bwd_node(arg):
+                    arg_stream = get_stream(arg)
+                    if arg_stream != cur_node_stream and get_device(arg).type != "cpu":
+                        insert_sync(gm.graph, node, arg, node_to_wait_event_ind)
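Note: the record_event/wait_event pair that insert_sync places in the graph establishes the same ordering that eager CUDA events provide. A sketch of that ordering in eager terms (assumes a CUDA device; the fx nodes torch.ops.streams.record_event/wait_event are the compiled counterparts of these calls):

    import torch

    producer_stream = torch.cuda.Stream()
    consumer_stream = torch.cuda.Stream()
    ev = torch.cuda.Event()

    with torch.cuda.stream(producer_stream):
        grad = torch.randn(1024, device="cuda").mul_(2)  # producer work
        ev.record(producer_stream)                       # record_event after the producer

    with torch.cuda.stream(consumer_stream):
        consumer_stream.wait_event(ev)                   # wait_event before the consumer
        out = grad + 1                                   # consumer now safely reads grad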
@ -713,6 +713,9 @@ class InvokeSubgraphCache(HopSubgraphCache):
         self.lazy_bwd_cache: dict[
             str, dict[tuple[object], tuple[torch.fx.GraphModule, int]]
         ] = defaultdict(dict)
+        self.effects_cache: dict[
+            str, set
+        ] = {}  # Maps identifier -> set of effect types

     def add_dynamo_installed_submodule(self, fn_id: int, identifier: str) -> None:
         self.dynamo_installed_submodules[fn_id].append(identifier)

@ -751,6 +754,21 @@ class InvokeSubgraphCache(HopSubgraphCache):

         return self.lazy_bwd_cache[identifier].get(tangent_metadata, (None, None))

+    def add_effects(self, identifier: str, effects: set) -> None:
+        """Store the effect types for a given invoke_subgraph identifier."""
+        if prev_effects := self.effects_cache.get(identifier, None):
+            assert effects == prev_effects, (
+                "Different number of effects were found for invoke_subgraph "
+                f"call with identifier {identifier}. \n"
+                f"Previously we had the following effects: {prev_effects}.\n"
+                f"But now we have: {effects}."
+            )
+        self.effects_cache[identifier] = effects
+
+    def get_effects(self, identifier: str) -> Optional[set]:
+        """Retrieve the effect types for a given invoke_subgraph identifier."""
+        return self.effects_cache.get(identifier, None)
+

 class HopDispatchSetCache:
     def __init__(self) -> None:
@ -80,6 +80,7 @@ class InvokeSubgraphHOP(HigherOrderOperator):
         assert all(
             isinstance(o, (torch.Tensor, int, torch.SymInt, torch.Generator))
             for o in operands
+            if o is not None
         ), (
             f"invoke_subgraph operands must be a list of tensors/ints/SymInts/Generator {operands}"
         )

@ -304,6 +305,62 @@ def create_fw_bw_graph(subgraph, operands, grad_outputs=None):


 def get_output_metadata(subgraph, *operands):
+    """
+    Extract metadata about the subgraph outputs WITHOUT executing the subgraph.
+    This avoids running side-effectful operations twice (once here, once in forward).
+    We analyze the graph structure statically to extract metadata.
+    """
+    # Unwrap FunctionalizeCtxWrapper if present
+    if isinstance(subgraph, FunctionalizeCtxWrapper):
+        subgraph = subgraph.subgraph
+
+    # If not a GraphModule, fall back to execution-based metadata extraction
+    if not isinstance(subgraph, torch.fx.GraphModule):
+        return _get_output_metadata_by_execution(subgraph, *operands)
+
+    output_metadata = OutputMetadata()
+
+    # Extract output arguments from the output node
+    # The output node has args=(output_values,) where output_values is a tuple/list
+    output_node = next(reversed(subgraph.graph.find_nodes(op="output")))
+    output_metadata.num_fw_outs = len(output_node.args[0])
+
+    for idx, output_arg in enumerate(output_node.args[0]):
+        if not isinstance(output_arg, torch.fx.Node):
+            if isinstance(output_arg, int):
+                output_metadata.indexes_with_symint.add(idx)
+                output_metadata.indexes_with_no_grad.add(idx)
+            continue
+
+        # Check node metadata for type information
+        if output_arg.meta.get("val") is None:
+            # If we don't have complete metadata for all outputs, fall back to execution
+            # This is important for correctness (e.g., detecting SymInts) even though it
+            # runs side-effectful operations
+            return _get_output_metadata_by_execution(subgraph, *operands)
+
+        val = output_arg.meta["val"]
+        if isinstance(val, torch.SymInt):
+            output_metadata.indexes_with_symint.add(idx)
+            output_metadata.indexes_with_no_grad.add(idx)
+        elif isinstance(val, torch.Tensor):
+            # Check if tensor requires grad from metadata
+            if hasattr(val, "requires_grad") and not val.requires_grad:
+                output_metadata.indexes_with_no_grad.add(idx)
+        else:
+            # Non-tensor, non-symint (shouldn't happen but be safe)
+            output_metadata.indexes_with_no_grad.add(idx)
+
+    return output_metadata
+
+
+def _get_output_metadata_by_execution(subgraph, *operands):
+    """
+    Fallback: Extract metadata by executing the subgraph.
+    This should only be used when static analysis fails.
+    WARNING: This will run side-effectful operations!
+    """
     with suspend_functionalization(), disable_functional_mode():
         with disable_proxy_modes_tracing():
             # args are functional tensors, generate some example tensors

@ -323,19 +380,15 @@ def get_output_metadata(subgraph, *operands):

             num_fw_outs = len(fw_outs)

-            # Collect the indexes of none in the output to check that the grad
-            # is None at the corresponding index in the backward. This check is
-            # performed in the autograd.Function - InvokeSubgraphAutogradOp.
-            # Also collect the indexes of no_grad in the output to filter out
-            # the grad_outs in the `backward` method.
             output_metadata = OutputMetadata()

             output_metadata.num_fw_outs = num_fw_outs

             for idx, fw_out in enumerate(fw_outs):
                 if isinstance(fw_out, torch.SymInt):
                     output_metadata.indexes_with_symint.add(idx)
                 elif not fw_out.requires_grad:
                     output_metadata.indexes_with_no_grad.add(idx)

             return output_metadata
@ -562,7 +615,34 @@ def _(ctx, subgraph, identifier, *operands):
         do_auto_functionalize_v2,
     )

+    # (in the functionalization metadata phase) Capture tokens before
+    tokens_before = dict(ctx.mode._tokens)
+
+    # Check if this subgraph has effects stored in the cache
+    invoke_subgraph_cache = get_invoke_subgraph_cache()
+    effects = None
+    if invoke_subgraph_cache:
+        effects = invoke_subgraph_cache.get_effects(identifier)
+
+    if effects:
+        assert len(effects) == 1, "Multiple effects within a subgraph NYI"
+        tokens = ctx.mode._tokens
+        effects = next(iter(effects))
+        token_input = tokens[effects]
+
+        operands = (token_input, *operands)
+
+        def wrap_subgraph(subgraph):
+            def wrapped_subgraph(token, *args):
+                res = subgraph(*args)
+                return ctx.unwrap_tensors(ctx.mode._tokens[effects]), *res
+
+            return wrapped_subgraph
+
+        subgraph = wrap_subgraph(subgraph)
+
     unwrapped_operands = ctx.unwrap_tensors(operands)

     hop_instance = HopInstance.create(invoke_subgraph, subgraph, identifier, *operands)
     if can_auto_functionalize(hop_instance):
         # NOTE: [auto_functionalize x invoke_subgraph caching]

@ -587,6 +667,28 @@ def _(ctx, subgraph, identifier, *operands):
     # of invoke_subgraph ops if input aliasing/mutation is detected.
     functionalized_subgraph = FunctionalizeCtxWrapper(ctx, subgraph)
     out = invoke_subgraph(functionalized_subgraph, identifier, *unwrapped_operands)
+
+    if effects:
+        (new_token, *out) = out
+        ctx.mode._tokens[effects] = new_token
+
+    # (in the functionalization metadata phase) Capture tokens after and see if
+    # there are any differences (there are new effects or the token value for an
+    # effect type has changed)
+    tokens_after = dict(ctx.mode._tokens)
+    discovered_effects = set()
+    for effect_type, token in tokens_after.items():
+        if effect_type not in tokens_before or tokens_before[effect_type] is not token:
+            discovered_effects.add(effect_type)
+
+    if discovered_effects:
+        assert ctx.mode._allow_token_discovery, (
+            f"Number of tokens changed by {len(discovered_effects)} when tracing subgraph {subgraph}."
+        )
+        # Store discovered effects in the cache by identifier
+        if invoke_subgraph_cache:
+            invoke_subgraph_cache.add_effects(identifier, discovered_effects)
+
     return ctx.wrap_tensors(out)
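Note: a pure-Python sketch of the token threading performed above, stripped of the functionalization machinery. The names are illustrative stand-ins, not the real API: each effect type owns a token, and the wrapped subgraph takes and returns it so that effectful calls are serialized.

    def wrap_subgraph(subgraph, tokens, effect):
        def wrapped(token, *args):
            tokens[effect] = token          # make the incoming token current
            res = subgraph(*args)           # effectful ops would update tokens[effect]
            return (tokens[effect], *res)   # thread the fresh token out

        return wrapped

    tokens = {"ORDERED": object()}
    wrapped = wrap_subgraph(lambda x: (x + 1,), tokens, "ORDERED")
    new_token, out = wrapped(tokens["ORDERED"], 41)  # out == 42, token threaded through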
@ -96,6 +96,7 @@ class CppWrapperCpu(PythonWrapperCodegen):
         self.include_extra_header = functools.lru_cache(None)(  # type: ignore[method-assign]
             self._include_extra_header
         )
+        self.codegen_int_array_var_cache = {}

     @staticmethod
     def create(

@ -1636,14 +1637,33 @@ class CppWrapperCpu(PythonWrapperCodegen):
             self.used_cached_memory_formats.add(memory_format_str)
         return f"cached_torch_memory_format_{memory_format_str}"

-    @functools.cache  # noqa: B019
     def codegen_int_array_var(
         self,
         int_array: str,
         writeline: Callable[..., None],
         known_statically=False,
         graph=None,  # for per-graph caching
-    ):
+    ) -> str:
+        # Use id(graph) for caching to avoid circular references
+        cache_key = (
+            int_array,
+            id(writeline),
+            known_statically,
+            id(graph) if graph else None,
+        )
+        if cache_key not in self.codegen_int_array_var_cache:
+            self.codegen_int_array_var_cache[cache_key] = (
+                self._codegen_int_array_var_impl(int_array, writeline, known_statically)
+            )
+
+        return self.codegen_int_array_var_cache[cache_key]
+
+    def _codegen_int_array_var_impl(
+        self,
+        int_array: str,
+        writeline: Callable[..., None],
+        known_statically: bool,
+    ) -> str:
         # Used for size/stride declaration
         #
         # Because the memory planning is done in two passes (see the implementation
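Note: a minimal sketch of why @functools.cache on a bound method can leak (an assumption consistent with the issue referenced in the test above): the process-wide cache keys on `self`, so every wrapper instance, and whatever graph state it references, is kept alive for the life of the process. Moving to an explicit per-instance dict keyed on ids, as in the hunk above, ties the cache's lifetime to the instance.

    import functools, gc, weakref

    class Wrapper:
        @functools.cache               # cache entry holds a strong ref to self
        def expand(self, key):
            return f"{key}-expanded"

    w = Wrapper()
    w.expand("sizes")
    ref = weakref.ref(w)
    del w
    gc.collect()
    print(ref() is not None)  # True: the global cache still pins the instance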
@ -35,6 +35,18 @@ class EffectHolder:
         if namespace == "higher_order":
             return

+        # These classes do not have side effects as they just store quantization
+        # params, so we dont need to mark them as ordered
+        skip_classes = (
+            "__torch__.torch.classes.quantized.Conv2dPackedParamsBase",
+            "__torch__.torch.classes.quantized.Conv3dPackedParamsBase",
+            "__torch__.torch.classes.quantized.EmbeddingPackedParamsBase",
+            "__torch__.torch.classes.quantized.LinearPackedParamsBase",
+            "__torch__.torch.classes.xnnpack.Conv2dOpContext",
+            "__torch__.torch.classes.xnnpack.LinearOpContext",
+            "__torch__.torch.classes.xnnpack.TransposeConv2dOpContext",
+        )
+
         opname = f"{namespace}::{opname}"
         if torch._C._get_operation_overload(opname, overload) is not None:
             # Since we call this when destroying the library, sometimes the

@ -42,6 +54,9 @@ class EffectHolder:
             schema = torch._C._get_schema(opname, overload)
             for arg in schema.arguments:
                 if isinstance(arg.type, torch.ClassType):
+                    type_str = arg.type.str()  # pyrefly: ignore[missing-attribute]
+                    if type_str in skip_classes:
+                        continue
                     self._effect = EffectType.ORDERED
                     return
@ -138,7 +138,7 @@ inline void PyErr_SetString(PyObject* type, const std::string& message) {
       throw;                                                          \
     }                                                                 \
   }                                                                   \
-  catch (const std::exception& e) {                                   \
+  catch (const std::exception&) {                                     \
     torch::translate_exception_to_python(std::current_exception());   \
     return retval;                                                    \
   }
@ -390,31 +390,27 @@ PyObject* THPAutograd_initExtension(PyObject* _unused, PyObject* unused) {
   m.def("_supported_activities", []() {
     std::set<torch::profiler::impl::ActivityType> activities{
         torch::profiler::impl::ActivityType::CPU};
-#if defined(USE_KINETO) && \
-    (!defined(LIBKINETO_NOCUPTI) || !defined(LIBKINETO_NOROCTRACER))
-    if (at::hasMTIA()) {
-      activities.insert(torch::profiler::impl::ActivityType::MTIA);
-    }
-    if (at::hasHPU()) {
-      activities.insert(torch::profiler::impl::ActivityType::HPU);
-    }
+#if defined(USE_KINETO)
+#if (!defined(LIBKINETO_NOCUPTI) || !defined(LIBKINETO_NOROCTRACER))
     if (at::getNumGPUs() > 0) {
       activities.insert(torch::profiler::impl::ActivityType::CUDA);
     }
-#elif defined(USE_KINETO)
+#endif // (!defined(LIBKINETO_NOCUPTI) || !defined(LIBKINETO_NOROCTRACER))
+#if (!defined(LIBKINETO_NOXPUPTI))
     if (at::hasXPU()) {
       activities.insert(torch::profiler::impl::ActivityType::XPU);
     }
-    if (at::hasHPU()) {
-      activities.insert(torch::profiler::impl::ActivityType::HPU);
-    }
+#endif // (!defined(LIBKINETO_NOXPUPTI))
     if (at::hasMTIA()) {
       activities.insert(torch::profiler::impl::ActivityType::MTIA);
     }
+    if (at::hasHPU()) {
+      activities.insert(torch::profiler::impl::ActivityType::HPU);
+    }
     if (c10::get_privateuse1_backend() != "privateuseone") {
       activities.insert(torch::profiler::impl::ActivityType::PrivateUse1);
     }
-#endif
+#endif // defined(USE_KINETO)
     return activities;
   });
@ -1200,25 +1200,27 @@ get_thread_local_native_sharding_propagator_cache() {
         py::reinterpret_borrow<py::dict>(PyThreadState_GetDict());
     // We need to clean up before Python detaches from the thread if
     // the thread is being destroyed.
-    thread_dict["__DTensor_fastpath_thread_cache_cleanup"] =
-        py::capsule(new std::thread::id(this_thread_id), [](void* p) {
-          auto* ptid = reinterpret_cast<std::thread::id*>(p);
-          {
-            std::lock_guard<std::mutex> inner_lock(
-                native_sharding_propagator_cache_cleanup_mutex);
-            auto it = all_thread_caches.find(*ptid);
-            if (it != all_thread_caches.end()) {
-              // We need to both:
-              // 1) free python objects, and
-              it->second->reset();
-              // 2) make sure we don't try to come back and mess with
-              // a destroyed thread-local at module unload (e.g.,
-              // process exit) time.
-              all_thread_caches.erase(it);
-            }
-          }
-          delete ptid;
-        });
+    if (!thread_dict.contains("__DTensor_fastpath_thread_cache_cleanup")) {
+      thread_dict["__DTensor_fastpath_thread_cache_cleanup"] =
+          py::capsule(new std::thread::id(this_thread_id), [](void* p) {
+            auto* ptid = reinterpret_cast<std::thread::id*>(p);
+            {
+              std::lock_guard<std::mutex> inner_lock(
+                  native_sharding_propagator_cache_cleanup_mutex);
+              auto it = all_thread_caches.find(*ptid);
+              if (it != all_thread_caches.end()) {
+                // We need to both:
+                // 1) free python objects, and
+                it->second->reset();
+                // 2) make sure we don't try to come back and mess with
+                // a destroyed thread-local at module unload (e.g.,
+                // process exit) time.
+                all_thread_caches.erase(it);
+              }
+            }
+            delete ptid;
+          });
+    }
   }
   return native_sharding_propagator_cache_DO_NOT_USE.value();
 }
@@ -81,7 +81,7 @@ c10::intrusive_ptr<Backend> ProcessGroup::getBackend(
  ProcessGroup::BackendType backendType{ProcessGroup::BackendType::UNDEFINED};
  try {
    backendType = deviceTypeToBackendType_.at(deviceType);
-  } catch (const std::out_of_range& e) {
+  } catch (const std::out_of_range&) {
    TORCH_CHECK(
        false, "No backend type associated with device type ", deviceType);
  }
@@ -246,7 +246,7 @@ class UvTcpServer : public UvTcpSocket {
            uv_err_name(uv_res),
            uv_strerror(uv_res)));
    res->cacheSocketPort();
-  } catch (std::exception& ex) {
+  } catch (std::exception&) {
    res->close();
    throw;
  }
@@ -322,7 +322,7 @@ class UvTcpServer : public UvTcpSocket {
            uv_err_name(uv_res),
            uv_strerror(uv_res)));
    res->cacheSocketPort();
-  } catch (std::exception& ex) {
+  } catch (std::exception&) {
    res->close();
    throw;
  }
@@ -353,7 +353,7 @@ static PyObject* NodeBase__update_args_kwargs(
    Py_CLEAR(node->_kwargs);
    node->_kwargs = map_aggregate(args[1], visit_fn);
    Py_RETURN_NONE;
-  } catch (const PythonError& e) {
+  } catch (const PythonError&) {
    return nullptr;
  }
}
@@ -397,7 +397,7 @@ static PyObject* NodeBase__replace_input_with(

    PyObject* update_args[2] = {new_args.get(), new_kwargs.get()};
    return NodeBase__update_args_kwargs(self, update_args, 2);
-  } catch (const PythonError& e) {
+  } catch (const PythonError&) {
    return nullptr;
  }
}
@@ -802,7 +802,7 @@ static PyObject* py_map_aggregate(
    // args[0]: aggregate, args[1]: callable fn
    return map_aggregate(
        args[0], [fn](PyObject* a) { return PyObject_CallOneArg(fn, a); });
-  } catch (const PythonError& e) {
+  } catch (const PythonError&) {
    return nullptr; // error should already be set
  }
}
@@ -824,7 +824,7 @@ static PyObject* py_map_arg(
      }
      return Py_NewRef(a);
    });
-  } catch (const PythonError& e) {
+  } catch (const PythonError&) {
    return nullptr; // error should already be set
  }
}
@@ -117,7 +117,7 @@ struct type_caster<torch::jit::IValue> {
    try {
      value = torch::jit::toTypeInferredIValue(src);
      return true;
-    } catch (std::exception& e) {
+    } catch (std::exception&) {
      return false;
    }
  }
@@ -142,7 +142,7 @@ struct type_caster<torch::jit::Symbol> {
    std::string src_str;
    try {
      src_str = py::cast<std::string>(src);
-    } catch (std::exception& e) {
+    } catch (std::exception&) {
      return false;
    }
    value = torch::jit::Symbol::fromQualString(src_str);
@@ -281,11 +281,11 @@ struct FromImpl<torch::headeronly::HeaderOnlyArrayRef<T>> {
    TORCH_ERROR_CODE_CHECK(
        torch_new_list_reserve_size(val.size(), &new_list_handle));
    for (const auto& elem : val) {
-      TORCH_ERROR_CODE_CHECK(
-          torch_list_push_back(new_list_handle, from(elem)));
+      TORCH_ERROR_CODE_CHECK(torch_list_push_back(
+          new_list_handle, torch::stable::detail::from(elem)));
    }
-    return from(new_list_handle);
-  } catch (const std::runtime_error& e) {
+    return torch::stable::detail::from(new_list_handle);
+  } catch (const std::runtime_error&) {
    if (new_list_handle != nullptr) {
      // clean up memory if an error was thrown
      TORCH_ERROR_CODE_CHECK(torch_delete_list(new_list_handle));
@@ -553,7 +553,7 @@ struct ToImpl<std::vector<T>> {
    }
    TORCH_ERROR_CODE_CHECK(torch_delete_list(list_handle));
    return result;
-  } catch (const std::runtime_error& e) {
+  } catch (const std::runtime_error&) {
    // clean up memory if an exception is thrown, and rethrow
    TORCH_ERROR_CODE_CHECK(torch_delete_list(list_handle));
    throw;
@@ -76,7 +76,11 @@ from torch.fx.experimental._constant_symnode import ConstantIntNode
 from torch.nested._internal.nested_int import NestedIntNode
 from torch.utils import _pytree as pytree
 from torch.utils._mode_utils import no_dispatch
-from torch.utils._python_dispatch import return_and_correct_aliasing, TorchDispatchMode
+from torch.utils._python_dispatch import (
+    _get_current_dispatch_mode_stack,
+    return_and_correct_aliasing,
+    TorchDispatchMode,
+)
 from torch.utils.checkpoint import get_device_states, set_device_states

@@ -86,6 +90,12 @@ not_implemented_log = torch._logging.getArtifactLogger(__name__, "not_implemented")
 from . import _c10d


+def _is_in_fake_tensor_mode() -> bool:
+    return any(
+        isinstance(mode, FakeTensorMode) for mode in _get_current_dispatch_mode_stack()
+    )
+
+
 def _is_inplace_op(op: OpOverload | Callable[..., Any]) -> bool:
     return (
         isinstance(op, OpOverload)
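The added `_is_in_fake_tensor_mode` helper scans the whole dispatch-mode stack rather than only the innermost mode. A standalone version of the same check (the wrapper name here is illustrative, not part of the diff):

```python
import torch
from torch._subclasses.fake_tensor import FakeTensorMode
from torch.utils._python_dispatch import _get_current_dispatch_mode_stack


def is_fake_mode_active() -> bool:
    # True if any active TorchDispatchMode on the stack is a FakeTensorMode,
    # not just the innermost one.
    return any(
        isinstance(mode, FakeTensorMode)
        for mode in _get_current_dispatch_mode_stack()
    )


with FakeTensorMode():
    assert is_fake_mode_active()
assert not is_fake_mode_active()
```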
@@ -256,21 +266,31 @@ def _for_each_rank_run_func(
        a.wait() if isinstance(a, AsyncCollectiveTensor) else a for a in flat_args
    ]

-    # NB: Before invoking an op we are collecting rng states from CPU and
-    # CUDA devices such that we can reset to the same before invoking op
-    # for each rank. This is not very efficient and will likely be revisited
-    # to support per rank rng state.
-    rng_state = _get_rng_state()
+    lm = enabled_local_tensor_mode()
+    use_per_rank_rng = lm is not None and len(lm._per_rank_rng_states) > 0
+
+    global_rng_state = None if use_per_rank_rng else _get_rng_state()
+
    flat_rank_rets = {}

    default_value: Tensor | None = None
    for r in sorted(ranks):
-        _set_rng_state(*rng_state)
+        if use_per_rank_rng:
+            assert lm is not None
+            _set_rng_state(*lm._per_rank_rng_states[r])
+        else:
+            assert global_rng_state is not None
+            _set_rng_state(*global_rng_state)
+
        rank_flat_args = [_map_to_rank_local_val(a, r) for a in flat_args]
        rank_args, rank_kwargs = pytree.tree_unflatten(rank_flat_args, args_spec)
        rank_ret = func(*rank_args, **rank_kwargs)
        flat_rank_rets[r] = rank_ret

+        if use_per_rank_rng:
+            assert lm is not None
+            lm._per_rank_rng_states[r] = _get_rng_state()
+
        if default_value is None and func is torch.ops.aten.split.Tensor:
            # If split happens over the dimension smaller than the number of chunks
            # it is possible that some ranks will produce shorter lists of chunks.
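The rewritten loop above swaps a single shared RNG snapshot for per-rank save/run/store bookkeeping. A minimal sketch of that bookkeeping, using `torch.get_rng_state`/`torch.set_rng_state` in place of the diff's `_get_rng_state`/`_set_rng_state` helpers:

```python
import torch

per_rank_rng_states = {r: torch.get_rng_state() for r in range(2)}
results = {}
for r in sorted(per_rank_rng_states):
    torch.set_rng_state(per_rank_rng_states[r])     # enter rank r's RNG stream
    results[r] = torch.randn(3)                     # the "op" run for rank r
    per_rank_rng_states[r] = torch.get_rng_state()  # persist the advanced state
# Both ranks started from the same snapshot, so they drew identical samples.
assert torch.equal(results[0], results[1])
```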
@@ -437,6 +457,247 @@ class LocalIntNode:
        return ConstantIntNode(num)


+class _LocalDeviceHandle:
+    """
+    Wrapper around device module (e.g., torch.cuda) with automatic LocalTensor semantics.
+
+    This class wraps device modules and automatically handles per-rank operations in
+    LocalTensor mode:
+    - get_rng_state() returns a LocalTensor with per-rank states
+    - set_rng_state(LocalTensor) sets per-rank states
+
+    When not in LocalTensor mode, it delegates directly to the underlying device handle.
+    """
+
+    def __init__(self, device_handle, device_type: str):
+        """
+        Initialize the local device handle wrapper.
+
+        Args:
+            device_handle: The underlying device module (e.g., torch.cuda)
+            device_type: Device type string (e.g., "cuda", "cpu")
+        """
+        self._device_handle = device_handle
+        self._device_type = device_type
+
+    def get_rng_state(self):
+        """
+        Get RNG state, automatically returning LocalTensor in LocalTensor mode.
+
+        Returns:
+            LocalTensor in LocalTensor mode, regular Tensor otherwise
+        """
+        lm = enabled_local_tensor_mode()
+        if not lm:
+            return self._device_handle.get_rng_state()
+
+        original_state = _get_rng_state()
+        per_rank_states = {}
+
+        try:
+            for rank in lm.ranks:
+                # We need to set-then-get instead of directly copying lm._per_rank_rng_states[rank]
+                # because they have different structures:
+                # - lm._per_rank_rng_states[rank] is a tuple: (cpu_state, {device_idx: cuda_state})
+                # - self._device_handle.get_rng_state() returns just the device-specific tensor
+                # So we temporarily restore the full RNG state (CPU + all CUDA devices) for this rank,
+                # then extract only the specific device's state tensor that we need.
+                if rank in lm._per_rank_rng_states:
+                    _set_rng_state(*lm._per_rank_rng_states[rank])
+
+                per_rank_states[rank] = self._device_handle.get_rng_state()
+        finally:
+            _set_rng_state(*original_state)
+
+        # pyrefly: ignore [bad-argument-type, bad-argument-count]
+        return LocalTensor(per_rank_states)
+
+    def set_rng_state(self, state):
+        """
+        Set RNG state, automatically handling LocalTensor input.
+
+        Args:
+            state: Regular Tensor or LocalTensor with per-rank states
+        """
+        if isinstance(state, LocalTensor):
+            lm = enabled_local_tensor_mode()
+            assert lm is not None
+
+            # Similar to get_rng_state but in reverse: we need to convert from
+            # device-specific tensor format to full state tuple format.
+            # - state._local_tensors[rank] contains just the device-specific RNG state tensor
+            # - lm._per_rank_rng_states[rank] needs a tuple: (cpu_state, {device_idx: cuda_state})
+            # So we set the device's state with the rank-specific tensor, then _get_rng_state()
+            # captures both CPU and CUDA states into the tuple format that _per_rank_rng_states expects.
+            for rank, rank_state in state._local_tensors.items():
+                self._device_handle.set_rng_state(rank_state.to("cpu"))
+                lm._per_rank_rng_states[rank] = _get_rng_state()
+        else:
+            self._device_handle.set_rng_state(state.to("cpu"))
+
+    def __getattr__(self, name):
+        """Delegate all other attributes to the underlying device module."""
+        return getattr(self._device_handle, name)
+
+
+class _LocalOffsetBasedRNGTracker:
+    """
+    LocalTensor-specific RNG tracker for DTensor random operations.
+
+    This class manages per-rank RNG states when running in LocalTensor mode,
+    using _LocalPhiloxState to track different offsets for each virtual rank.
+    It is instantiated and used by OffsetBasedRNGTracker when in LocalTensor mode.
+
+    Much of this is derived from OffsetBasedRNGTracker:
+    https://github.com/pytorch/pytorch/blob/402c46503002f98ccfc023a733081fb0719223a1/torch/distributed/tensor/_random.py#L182
+    """
+
+    def __init__(self, device_type: str = "cuda"):
+        """Initialize the LocalTensor RNG tracker."""
+        from torch.distributed.device_mesh import _get_device_handle
+
+        self._device_type = device_type
+        self._device_handle = _LocalDeviceHandle(
+            _get_device_handle(device_type), device_type
+        )
+        self.distribute_region_enabled = True
+        self._device_mesh = None
+
+    @property
+    def _device(self):
+        return torch.device(self._device_type, torch.cuda.current_device())
+
+    def _set_pre_op_offset(self, state, spec) -> None:
+        """Compute and set per-rank offsets before the random operation."""
+        from torch.distributed.tensor._ops.utils import prod
+        from torch.distributed.tensor._utils import (
+            _compute_local_shape_and_global_offset,
+        )
+        from torch.distributed.tensor.placement_types import Shard
+
+        lm = enabled_local_tensor_mode()
+        assert lm is not None
+
+        state._per_rank_offsets = {}
+
+        for rank in lm.ranks:
+            # compute this rank's coordinate in the mesh
+            mesh_coords = []
+            for mesh_dim_idx in range(spec.mesh.ndim):
+                mesh_dim_size = spec.mesh.size(mesh_dim_idx)
+                # calculate rank's coordinate in this mesh dimension
+                num_chunks_after = 1
+                for j in range(mesh_dim_idx + 1, spec.mesh.ndim):
+                    num_chunks_after *= spec.mesh.size(j)
+                coord = (rank // num_chunks_after) % mesh_dim_size
+                mesh_coords.append(coord)
+
+            # compute local shape and global offset for this rank
+            local_shape, global_offset = _compute_local_shape_and_global_offset(
+                spec.shape, spec.mesh.shape, mesh_coords, spec.placements
+            )
+
+            # compute shard offset based on placements
+            shard_offset = 1
+            for idx, placement in enumerate(spec.placements):
+                if isinstance(placement, Shard):
+                    shard_dim = placement.dim
+                    shard_offset *= global_offset[shard_dim] + 1
+
+            # get current offset for this rank
+            current_offset = int(
+                state._per_rank_states[rank][8:].view(dtype=torch.int64).item()
+            )
+
+            # compute local size
+            local_size = prod(local_shape)
+
+            # compute new offset (must be multiple of 4)
+            shard_linear_idx = shard_offset - 1
+            offset_incr = (shard_linear_idx * local_size + 3) // 4 * 4
+            state._per_rank_offsets[rank] = current_offset + offset_incr
+
+    def _set_post_op_offset(self, state, spec, old_offset) -> None:
+        """Set per-rank offsets after the random operation."""
+        from torch.distributed.tensor._ops.utils import prod
+
+        lm = enabled_local_tensor_mode()
+        assert lm is not None
+
+        dtensor_shape = spec.shape
+        numel = prod(dtensor_shape)
+        # offset must be multiple of 4
+        numel = (numel + 3) // 4 * 4
+
+        if not hasattr(state, "_per_rank_offsets"):
+            state._per_rank_offsets = {}
+
+        # handle LocalIntNode old_offset (different values per rank)
+        if isinstance(old_offset, SymInt) and isinstance(old_offset.node, LocalIntNode):
+            for rank in lm.ranks:
+                rank_old_offset = old_offset.node._local_ints[rank]
+                state._per_rank_offsets[rank] = rank_old_offset + numel
+        else:
+            # same old_offset for all ranks
+            old_offset_int = (
+                int(old_offset) if isinstance(old_offset, SymInt) else old_offset
+            )
+            for rank in lm.ranks:
+                state._per_rank_offsets[rank] = old_offset_int + numel
+
+    @contextlib.contextmanager
+    def _distribute_region(self, spec, generator=None):
+        """Context manager for LocalTensor mode distribute region."""
+        lm = enabled_local_tensor_mode()
+        assert lm is not None
+
+        # get base state
+        if generator is not None:
+            base_state_tensor = generator.get_state()
+            per_rank_states = {rank: base_state_tensor.clone() for rank in lm.ranks}
+            # pyrefly: ignore [bad-argument-type, bad-argument-count]
+            base_state_tensor = LocalTensor(per_rank_states)
+        else:
+            base_state_tensor = self._device_handle.get_rng_state()
+
+        state = _LocalPhiloxState(base_state_tensor)
+
+        if self.distribute_region_enabled:
+            # sync to rank 0's state if no explicit generator
+            if generator is None:
+                rank_0_state = lm._per_rank_rng_states[0]
+                rank_0_cpu, rank_0_cuda = rank_0_state
+
+                if self._device.type == "cuda":
+                    assert self._device.index in rank_0_cuda
+                    rank_0_device_state = rank_0_cuda[self._device.index]
+                else:
+                    rank_0_device_state = rank_0_cpu
+
+                from torch.distributed.tensor._random import _PhiloxState
+
+                rank_0_philox = _PhiloxState(rank_0_device_state)
+                state.seed = rank_0_philox.seed
+                state.offset = rank_0_philox.offset
+
+            old_offset = state.offset
+            self._set_pre_op_offset(state, spec)
+            state.apply_to_local_tensor_mode(self._device_handle)
+
+            try:
+                yield
+            finally:
+                self._set_post_op_offset(state, spec, old_offset)
+                state.apply_to_local_tensor_mode(self._device_handle)
+        else:
+            yield
+
+        # maybe reset generator to rank 0's state
+        if generator is not None:
+            rank_0_state = state._per_rank_states[0]
+            generator.set_state(rank_0_state)
+
+
 _LOCAL_TENSOR_ATTR_PREFIX = "_local_tensor_"

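`_LocalDeviceHandle` above relies on `__getattr__` fallthrough so that only the RNG-state methods need LocalTensor-aware overrides. A standalone sketch of that delegation pattern (the proxy class here is illustrative, not the diff's implementation):

```python
import torch


class DeviceHandleProxy:
    def __init__(self, device_module):
        self._device_module = device_module

    def get_rng_state(self):
        # Override point: the real class returns a per-rank LocalTensor here.
        return self._device_module.get_rng_state()

    def __getattr__(self, name):
        # Invoked only when normal lookup fails, i.e. for everything
        # not explicitly overridden above.
        return getattr(self._device_module, name)


proxy = DeviceHandleProxy(torch.cuda if torch.cuda.is_available() else torch)
state = proxy.get_rng_state()  # overridden path
_ = proxy.manual_seed          # delegated path (attribute of the wrapped module)
```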
@@ -597,6 +858,7 @@ class LocalTensor(torch.Tensor):
        local_tensors_copy = {
            r: copy.deepcopy(t, memo) for r, t in self._local_tensors.items()
        }
+        # pyrefly: ignore [bad-argument-type, bad-argument-count]
        return LocalTensor(local_tensors_copy, self.requires_grad)

    def __repr__(self) -> str:  # type: ignore[override]
@@ -636,6 +898,7 @@ class LocalTensor(torch.Tensor):
        local_tensors = {
            _from_local_tensor_attr(a): t for a, t in inner_tensors.items()
        }
+        # pyrefly: ignore [bad-argument-type, bad-argument-count]
        return LocalTensor(local_tensors)

    @classmethod
@@ -774,12 +1037,28 @@ class LocalTensorMode(TorchDispatchMode):
        self.ranks = ranks
        self._disable = False
        self._old_get_coordinate = None
+        self._old_torch_manual_seed: Any = None
+        self._old_torch_initial_seed: Any = None
+        self._per_rank_rng_states: dict[
+            int, tuple[torch.Tensor, dict[int, torch.Tensor]]
+        ] = {}

    def __enter__(self) -> "LocalTensorMode":
        self._disable = False
        self._patch_device_mesh()
+        self._patch_random_functions()
        _LOCAL_TENSOR_MODE.append(self)

+        # _distribute_region will compute correct per-shard offsets
+        # but we want all ranks to start with the same state
+        if not _is_in_fake_tensor_mode():
+            cpu_state, cuda_states = _get_rng_state()
+            for rank in self.ranks:
+                self._per_rank_rng_states[rank] = (
+                    cpu_state.clone(),
+                    {idx: state.clone() for idx, state in cuda_states.items()},
+                )
+
        return super().__enter__()

    def __exit__(
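The `__enter__` snapshot above clones one baseline state per rank so that later per-rank mutations cannot alias each other or the live generator. A small demonstration of why the clones matter:

```python
import torch

ranks = range(4)
cpu_state = torch.get_rng_state()
per_rank_rng_states = {r: cpu_state.clone() for r in ranks}
# Mutating one rank's copy leaves the other copies (and the generator) intact.
per_rank_rng_states[0][0] ^= 1
assert not torch.equal(per_rank_rng_states[0], per_rank_rng_states[1])
assert torch.equal(per_rank_rng_states[1], torch.get_rng_state())
```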
@@ -790,6 +1069,7 @@ class LocalTensorMode(TorchDispatchMode):
    ) -> None:
        self._disable = True
        self._unpatch_device_mesh()
+        self._unpatch_random_functions()
        _LOCAL_TENSOR_MODE.pop()
        super().__exit__(exc_type, exc_val, exc_tb)

@@ -936,6 +1216,7 @@ class LocalTensorMode(TorchDispatchMode):
            m = cb(r, tensor._local_tensors[r])
            if m is not None:
                results[r] = m
+        # pyrefly: ignore [bad-argument-type, bad-argument-count]
        return LocalTensor(results)

    def _patch_device_mesh(self) -> None:
@@ -949,6 +1230,87 @@ class LocalTensorMode(TorchDispatchMode):
        # pyrefly: ignore [bad-assignment]
        self._old_get_coordinate = None

+    def _patch_random_functions(self) -> None:
+        import torch.random
+        from torch.distributed.tensor import _random as dtensor_random
+
+        if self._old_torch_manual_seed is None:
+            self._old_torch_manual_seed = torch.random.manual_seed
+            torch.random.manual_seed = _LocalRandom.torch_manual_seed
+            torch.manual_seed = _LocalRandom.torch_manual_seed
+
+        if self._old_torch_initial_seed is None:
+            self._old_torch_initial_seed = torch.random.initial_seed
+            torch.random.initial_seed = _LocalRandom.torch_initial_seed
+            torch.initial_seed = _LocalRandom.torch_initial_seed
+
+    def _unpatch_random_functions(self) -> None:
+        import torch.random
+        from torch.distributed.tensor import _random as dtensor_random
+
+        if self._old_torch_manual_seed is not None:
+            torch.random.manual_seed = self._old_torch_manual_seed
+            torch.manual_seed = self._old_torch_manual_seed
+            self._old_torch_manual_seed = None
+
+        if self._old_torch_initial_seed is not None:
+            torch.random.initial_seed = self._old_torch_initial_seed
+            torch.initial_seed = self._old_torch_initial_seed
+            self._old_torch_initial_seed = None
+
+
+class _LocalRandom:
+    """
+    Holds implementations of random functionality that must be patched while running
+    under LocalTensorMode.
+    """
+
+    @staticmethod
+    def torch_manual_seed(seed) -> torch._C.Generator:
+        """LocalTensor-aware version of torch.random.manual_seed."""
+        if (
+            (lm := enabled_local_tensor_mode())
+            and isinstance(seed, torch.SymInt)
+            and isinstance(seed.node, LocalIntNode)
+        ):
+            from torch.random import _manual_seed_impl
+
+            for rank in sorted(lm.ranks):
+                rank_seed = seed.node._local_ints[rank]
+                _manual_seed_impl(rank_seed, update_local_tensor_states=False)
+                lm._per_rank_rng_states[rank] = _get_rng_state()
+            return torch.random.default_generator
+        from torch.random import _manual_seed_impl
+
+        result = _manual_seed_impl(seed, update_local_tensor_states=False)
+
+        if lm is not None and len(lm._per_rank_rng_states) > 0:
+            cpu_state, cuda_states = _get_rng_state()
+            for rank in lm.ranks:
+                lm._per_rank_rng_states[rank] = (
+                    cpu_state.clone(),
+                    {idx: state.clone() for idx, state in cuda_states.items()},
+                )
+
+        return result
+
+    @staticmethod
+    def torch_initial_seed():
+        """LocalTensor-aware version of torch.random.initial_seed."""
+        if lm := enabled_local_tensor_mode():
+            if len(lm._per_rank_rng_states) == 0:
+                return torch.random.default_generator.initial_seed()
+            rank_seeds = {}
+
+            for rank in sorted(lm.ranks):
+                _set_rng_state(*lm._per_rank_rng_states[rank])
+                rank_seeds[rank] = torch.random.default_generator.initial_seed()
+
+            local_int_node = LocalIntNode(rank_seeds)
+            return torch.SymInt(local_int_node)
+
+        return torch.random.default_generator.initial_seed()
+
+
 class _LocalDeviceMesh:
    """
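`_patch_random_functions`/`_unpatch_random_functions` follow the usual save-patch-restore discipline for module-level functions. A self-contained sketch of that discipline; the mode class and the seed clamping here are purely illustrative:

```python
import torch


class SeedPatchingMode:
    def __enter__(self):
        self._saved_manual_seed = torch.manual_seed          # save original
        torch.manual_seed = lambda seed: self._saved_manual_seed(seed % 2**31)
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        torch.manual_seed = self._saved_manual_seed          # always restore
        self._saved_manual_seed = None


with SeedPatchingMode():
    torch.manual_seed(2**40 + 7)   # goes through the patched wrapper
torch.manual_seed(0)               # original function is back
```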
@@ -963,7 +1325,7 @@ class _LocalDeviceMesh:
        # doing this because when submesh is created it is created for a particular
        # rank (therefore below we are patching get_rank method). We are trying to
        # limit the invasiveness of local tensor.
-        lm = local_tensor_mode()
+        lm = enabled_local_tensor_mode()
        assert lm is not None, "Unexpectedly not in LocalTensorMode"

        coords: list[dict[int, int]] = [{} for _ in range(self.ndim)]
@@ -1024,6 +1386,22 @@ def local_tensor_mode() -> Optional[LocalTensorMode]:
    return None


+def enabled_local_tensor_mode() -> Optional[LocalTensorMode]:
+    """
+    Returns the current active LocalTensorMode only if it's enabled.
+
+    This is a convenience function that combines the common pattern of checking
+    if local_tensor_mode() is not None and not disabled.
+
+    Returns:
+        Optional[LocalTensorMode]: The current LocalTensorMode if active and enabled, else None.
+    """
+    lm = local_tensor_mode()
+    if lm is not None and not lm._disable:
+        return lm
+    return None
+
+
 def maybe_run_for_local_tensor(func: Callable[..., Any]) -> Callable[..., Any]:
    """
    Decorator that ensures a function is executed for each local tensor shard
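`enabled_local_tensor_mode` folds the "mode exists and is not disabled" check into one call, which is what makes the walrus-style guards used elsewhere in this diff possible. The same pattern with stand-in types:

```python
from typing import Optional


class Mode:
    def __init__(self, disabled: bool = False) -> None:
        self._disable = disabled


_MODE_STACK: list[Mode] = [Mode()]


def current_mode() -> Optional[Mode]:
    return _MODE_STACK[-1] if _MODE_STACK else None


def enabled_mode() -> Optional[Mode]:
    # Combine the "is there a mode" and "is it enabled" checks into one.
    m = current_mode()
    return m if m is not None and not m._disable else None


if m := enabled_mode():
    print("per-rank path with", m)
else:
    print("plain path")
```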
@@ -1048,8 +1426,7 @@ def maybe_run_for_local_tensor(func: Callable[..., Any]) -> Callable[..., Any]:

    @functools.wraps(func)
    def wrapper(*args, **kwargs):  # type: ignore[no-untyped-def]
-        lm = local_tensor_mode()
-        if lm is None or lm._disable:
+        if not (lm := enabled_local_tensor_mode()):
            return func(*args, **kwargs)
        ret = None
        with lm.disable():
@@ -1068,6 +1445,73 @@ def maybe_disable_local_tensor_mode() -> contextlib.AbstractContextManager:
    return lm.disable() if lm is not None else contextlib.nullcontext()


+def maybe_enable_local_tracker(
+    device_type: str, distribute_region_enabled: bool, spec, generator
+):
+    """
+    Returns a context manager for LocalTensor-mode RNG tracking if local tensor mode is enabled.
+
+    Args:
+        device_type: The device type (e.g., "cuda", "cpu")
+        distribute_region_enabled: Whether distribute region is enabled
+        spec: The DTensorSpec
+        generator: Optional torch.Generator
+
+    Returns:
+        Context manager from local_tracker._distribute_region if local tensor mode is enabled,
+        otherwise None.
+    """
+    if enabled_local_tensor_mode():
+        local_tracker = _LocalOffsetBasedRNGTracker(device_type)
+        local_tracker.distribute_region_enabled = distribute_region_enabled
+        return local_tracker._distribute_region(spec, generator)
+
+    return None
+
+
+def get_generator_seed_for_device_type(device_type: str):
+    """
+    Gets the generator seed for a specific device type, handling LocalTensor mode appropriately.
+
+    Args:
+        device_type: The device type (e.g., "cuda", "cpu")
+
+    Returns:
+        If in LocalTensor mode with per-rank RNG states:
+            - Returns int if all ranks have the same seed
+            - Returns SymInt(LocalIntNode) if ranks have different seeds
+        Otherwise:
+            - Returns int seed from the device's RNG state
+    """
+    if lm := enabled_local_tensor_mode():
+        if len(lm._per_rank_rng_states) == 0:
+            device_module = torch.get_device_module(device_type)
+            return device_module.get_rng_state()[:8].view(torch.int64).item()
+        device_module = torch.get_device_module(device_type)
+
+        original_state = _get_rng_state()
+
+        rank_seeds = {}
+        try:
+            for rank in sorted(lm.ranks):
+                _set_rng_state(*lm._per_rank_rng_states[rank])
+                rank_seeds[rank] = int(
+                    device_module.get_rng_state()[:8].view(torch.int64).item()
+                )
+        finally:
+            # restore original state
+            _set_rng_state(*original_state)
+
+        unique_seeds = set(rank_seeds.values())
+        if len(unique_seeds) == 1:
+            return next(iter(unique_seeds))
+        local_int_node = LocalIntNode(rank_seeds)
+        return torch.SymInt(local_int_node)
+    else:
+        device_module = torch.get_device_module(device_type)
+        return device_module.get_rng_state()[:8].view(torch.int64).item()
+
+
 import threading
 from queue import Queue

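`get_generator_seed_for_device_type` reads the seed out of the raw generator state tensor. For the CPU generator this works because, under the legacy state layout (an assumption worth noting), the first eight bytes hold the initial seed as a little-endian int64:

```python
import torch

torch.manual_seed(1234)
state = torch.get_rng_state()             # uint8 tensor; bytes 0..7 hold the seed
seed = state[:8].view(torch.int64).item()
assert seed == torch.initial_seed() == 1234
```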
@@ -1183,3 +1627,114 @@ class LocalRunnerMode:
        global _LOCAL_RUNNER_MODE
        assert _LOCAL_RUNNER_MODE is not None, "LocalRunnerMode is not enabled"
        return _LOCAL_RUNNER_MODE
+
+
+class _LocalPhiloxState:
+    """
+    LocalTensor-aware version of _PhiloxState that manages per-rank RNG states.
+    This class handles the case where the generator state is a LocalTensor, allowing
+    different offsets and seeds for different virtual ranks.
+
+    Note: This is designed to be used as a drop-in replacement for _PhiloxState
+    when working with LocalTensors in the DTensor random ops implementation.
+    """
+
+    def __init__(self, state: torch.Tensor):
+        assert isinstance(state, LocalTensor), (
+            "_LocalPhiloxState requires a LocalTensor"
+        )
+        self._local_tensor = state
+        self._per_rank_states = {
+            rank: local_state.to("cpu")
+            for rank, local_state in state._local_tensors.items()
+        }
+
+    @property
+    def state(self):
+        return LocalTensor(self._per_rank_states)  # type: ignore[name-defined]
+
+    @property
+    def offset(self) -> Union[int, SymInt]:
+        from torch.distributed.tensor._random import _PhiloxState
+
+        offsets = {}
+        for rank, state in self._per_rank_states.items():
+            rank_philox = _PhiloxState(state)
+            offsets[rank] = rank_philox.offset
+
+        if len(set(offsets.values())) == 1:
+            return next(iter(offsets.values()))
+        # pyrefly: ignore [bad-argument-type, bad-argument-count]
+        return SymInt(LocalIntNode(offsets))
+
+    @offset.setter
+    def offset(self, offset: Union[int, SymInt]) -> None:
+        from torch.distributed.tensor._random import _PhiloxState
+
+        if isinstance(offset, SymInt) and isinstance(offset.node, LocalIntNode):
+            for rank, state in self._per_rank_states.items():
+                rank_offset = offset.node._local_ints[rank]
+                rank_philox = _PhiloxState(state)
+                rank_philox.offset = rank_offset
+        else:
+            offset_int = int(offset) if isinstance(offset, SymInt) else offset
+            for state in self._per_rank_states.values():
+                rank_philox = _PhiloxState(state)
+                rank_philox.offset = offset_int
+
+    @property
+    def seed(self) -> Union[int, SymInt]:
+        from torch.distributed.tensor._random import _PhiloxState
+
+        seeds = {}
+        for rank, state in self._per_rank_states.items():
+            rank_philox = _PhiloxState(state)
+            seeds[rank] = rank_philox.seed
+
+        if len(set(seeds.values())) == 1:
+            return next(iter(seeds.values()))
+        return SymInt(LocalIntNode(seeds))
+
+    @seed.setter
+    def seed(self, seed: Union[int, SymInt]) -> None:
+        from torch.distributed.tensor._random import _PhiloxState
+
+        if isinstance(seed, SymInt) and isinstance(seed.node, LocalIntNode):
+            for rank, state in self._per_rank_states.items():
+                rank_seed = seed.node._local_ints[rank]
+                rank_philox = _PhiloxState(state)
+                rank_philox.seed = rank_seed
+        else:
+            seed_int = int(seed) if isinstance(seed, SymInt) else seed
+            for state in self._per_rank_states.values():
+                rank_philox = _PhiloxState(state)
+                rank_philox.seed = seed_int
+
+    def apply_to_local_tensor_mode(self, device_handle) -> None:
+        """
+        Apply per-rank RNG states to the LocalTensorMode's tracked states.
+        This updates both the device RNG state and the LocalTensorMode's _per_rank_rng_states.
+
+        Args:
+            device_handle: The device handle to use for setting RNG state (_LocalDeviceHandle)
+        """
+        if not enabled_local_tensor_mode():
+            return
+
+        assert hasattr(self, "_per_rank_offsets")
+
+        for rank in sorted(self._per_rank_states.keys()):
+            offset_value = self._per_rank_offsets[rank]
+            if isinstance(offset_value, SymInt):
+                if isinstance(offset_value.node, LocalIntNode):
+                    offset_value = offset_value.node._local_ints[rank]
+                else:
+                    offset_value = int(offset_value)
+
+            offset_tensor = torch.tensor(
+                [offset_value], dtype=torch.uint64, device="cpu"
+            ).view(torch.uint8)
+            self._per_rank_states[rank][8:] = offset_tensor
+
+        # pyrefly: ignore [bad-argument-type, bad-argument-count]
+        device_handle.set_rng_state(LocalTensor(self._per_rank_states))
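`_LocalPhiloxState` edits Philox-style state blobs in place, assuming bytes 0..7 hold the seed and the bytes from offset 8 hold the offset, both as uint8 views of 64-bit integers. A mock blob showing the round trip:

```python
import torch

state = torch.zeros(16, dtype=torch.uint8)  # mock Philox state blob
state[:8] = torch.tensor([42], dtype=torch.int64).view(torch.uint8)   # seed
state[8:] = torch.tensor([8], dtype=torch.uint64).view(torch.uint8)   # offset

seed = int(state[:8].view(torch.int64).item())
offset = int(state[8:].view(torch.int64).item())
assert (seed, offset) == (42, 8)
```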
@@ -547,8 +547,12 @@ def foreach_reduce(
                op=reduce_scatter_op,
            )
        else:
-            # For single GPU, just copy the input to output (no actual reduce-scatter needed)
-            reduce_output.copy_(reduce_scatter_input)
+            # For single GPU, just copy the input to output (no actual reduce-scatter needed), and
+            # account for a possible gradient_divide_factor.
+            if gradient_divide_factor is not None:
+                reduce_output.copy_(reduce_scatter_input / gradient_divide_factor)
+            else:
+                reduce_output.copy_(reduce_scatter_input)
        reduce_scatter_event = reduce_scatter_stream.record_event()
        post_reduce_stream = reduce_scatter_stream
        if all_reduce_group is not None:  # HSDP or DDP/replicate
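The new single-GPU branch divides eagerly instead of relying on a collective reduce op. A numeric spot check of that fallback:

```python
import torch

reduce_scatter_input = torch.tensor([2.0, 4.0, 6.0])
reduce_output = torch.empty_like(reduce_scatter_input)
gradient_divide_factor = 2.0  # illustrative value

if gradient_divide_factor is not None:
    reduce_output.copy_(reduce_scatter_input / gradient_divide_factor)
else:
    reduce_output.copy_(reduce_scatter_input)

assert torch.equal(reduce_output, torch.tensor([1.0, 2.0, 3.0]))
```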
@@ -721,20 +725,21 @@ def _get_gradient_divide_factors(
    if all_reduce_group is not None:
        data_parallel_size *= all_reduce_group.size()

-    if factor is None:
-        factor = float(data_parallel_size)
-
    if not overflow_risk and not force_sum_reduction_for_comms:
-        if factor == data_parallel_size:
+        if factor is None:
            # Warning: NCCL ReduceOp.AVG may produce incorrect results with
            # world size 1.
            if data_parallel_size == 1:
                return None, None, ReduceOp.SUM, ReduceOp.SUM
            return None, None, ReduceOp.AVG, ReduceOp.AVG
+        if reduce_scatter_group is not None and factor == reduce_scatter_group.size():
+            reduce_scatter_op = ReduceOp.AVG
        else:
            reduce_scatter_op = torch.distributed._make_nccl_premul_sum(1 / factor)
        return None, None, reduce_scatter_op, ReduceOp.SUM

+    if factor is None:
+        factor = float(data_parallel_size)
    pre_factor: Optional[float]
    if overflow_risk:
        # Since fp16 has smaller dynamic range than fp32/bf16, we want to avoid
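The `ReduceOp.AVG` fast path added above is only valid when the divide factor equals the group size; otherwise the code falls back to a premultiplied sum. The equivalence it relies on, checked locally:

```python
import torch

factor = 4.0
grads = [torch.full((2,), float(i)) for i in range(int(factor))]  # one per rank
avg = torch.stack(grads).mean(dim=0)                   # what ReduceOp.AVG computes
premul_sum = torch.stack([g * (1.0 / factor) for g in grads]).sum(dim=0)
assert torch.allclose(avg, premul_sum)
```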
@@ -135,7 +135,9 @@ class OpDispatcher:
        self._random_ops = {
            aten.native_dropout.default,
            aten.normal_.default,
+            aten.rand.default,
            aten.rand_like.default,
+            aten.randn.default,
            aten.randn_like.default,
            aten.randint_like.default,
            aten.randint_like.low_dtype,
@@ -101,6 +101,9 @@ def manual_seed(seed: int, device_mesh: DeviceMesh) -> None:

    # DTensor no longer maintains a copy of rng state. manual seed on dtensor is the same thing
    # as manual seed on torch.
+    #
+    # torch.manual_seed will handle LocalTensor mode correctly by
+    # iterating through all ranks if seed is a LocalIntNode.
    torch.manual_seed(seed)


@@ -239,6 +242,16 @@ class OffsetBasedRNGTracker(_RNGStateTracker):
    def _distribute_region(
        self, spec: DTensorSpec, generator: Optional[torch.Generator] = None
    ):
+        from torch.distributed._local_tensor import maybe_enable_local_tracker
+
+        if local_tracker_context := maybe_enable_local_tracker(
+            self._device.type, self.distribute_region_enabled, spec, generator
+        ):
+            with local_tracker_context:
+                yield
+            return
+
+        # regular (non-LocalTensor) mode
        if generator is not None:
            # This is a little hacky, but for any user-passed generator, we store its state under a unique key,
            # not because we need to keep a copy of it but because its the easiest way to make it work with the
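`_distribute_region` now delegates early when a LocalTensor tracker is available: yield under the inner context, then return so the regular path never runs. The control-flow shape, with stand-in names:

```python
import contextlib


@contextlib.contextmanager
def inner_region():
    print("inner setup")
    yield
    print("inner teardown")


def maybe_inner_region(enabled: bool):
    return inner_region() if enabled else None


@contextlib.contextmanager
def distribute_region(enabled: bool):
    if (ctx := maybe_inner_region(enabled)) is not None:
        with ctx:
            yield
        return
    yield  # regular path


with distribute_region(enabled=True):
    pass  # body runs between "inner setup" and "inner teardown"
```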
@@ -15,113 +15,105 @@ from .graph_signature import (
 )


-def _remove_effect_tokens_from_graph_helper(
-    ep, num_tokens, input_token_names, output_token_names
-):
-    inputs_to_lifted_custom_objs = ep.graph_signature.inputs_to_lifted_custom_objs
-
-    output_node = None
-    with_effect_nodes: list[torch.fx.Node] = []
-
-    # Output node need to check its args against output_token_names (collected from output_spec)
-    # Therefore, we only need to find the top-levele output node
-    output_node = next(reversed(ep.graph_module.graph.find_nodes(op="output")))
-    for module in ep.graph_module.modules():
-        if not isinstance(module, torch.fx.GraphModule):
-            continue
-
-        for node in module.graph.nodes:
-            if not (node.op == "call_function" and node.target is with_effects):
-                continue
-
-            with_effect_nodes.append(node)
-
-    # Remove tokens from outputs
-    assert output_node is not None
-    output_args = output_node.args[0]
-    assert len(output_args) >= num_tokens
-    out_token_nodes = output_args[:num_tokens]
-    output_node.args = (tuple(output_args[num_tokens:]),)
-    for out_token in out_token_nodes:
-        assert out_token.name in output_token_names
-        out_token.users.clear()
-        ep.graph.erase_node(out_token)
-
-    # Replace with_effects(token, func, args) with just func(args)
-    for node in reversed(with_effect_nodes):
-        func = node.args[1]
-        assert isinstance(func, (torch._ops.OpOverload, torch._ops.HigherOrderOperator))
-
-        if func is torch.ops.higher_order.call_torchbind:
-            custom_obj_meta = node.args[2].meta["val"]  # type: ignore[union-attr]
-            assert isinstance(custom_obj_meta, CustomObjArgument)
-            if custom_obj_meta.fake_val:
-                custom_obj = custom_obj_meta.fake_val
-            elif node.args[2].name in inputs_to_lifted_custom_objs:  # type: ignore[union-attr]
-                custom_obj = ep.constants[
-                    inputs_to_lifted_custom_objs[node.args[2].name]  # type: ignore[union-attr]
-                ]
-            else:
-                raise RuntimeError(f"Unable to find custom obj for node {node}")
-            schema = _get_schema(func, (custom_obj,) + node.args[3:])
-        else:
-            schema = _get_schema(func, node.args[2:])
-
-        with ep.graph.inserting_before(node):
-            new_node = ep.graph.call_function(func, node.args[2:], node.kwargs)
-            for k, v in node.meta.items():
-                new_node.meta[k] = v
-                if k == "unbacked_bindings":
-                    # Remove the extra layer for effect token
-                    old_bindings = new_node.meta[k]
-                    new_bindings = {
-                        k: path[1:] if path else path for k, path in old_bindings.items()
-                    }
-                    new_node.meta[k] = new_bindings
-
-            node.replace_all_uses_with(new_node)
-
-            # Update user getitem nodes
-            for user in list(new_node.users.keys()):
-                assert user.target is operator.getitem
-                # getitem(with_effects, 0) == token
-                if user.args[1] == 0:
-                    ep.graph.erase_node(user)
-
-            if len(schema.returns) == 1:
-                # If the function has 1 return then it will just directly return the
-                # result -- we don't need a getitem. So we can replace all the
-                # getitem(with_effects, 1) with just the note itself.
-                for user in list(new_node.users.keys()):
-                    assert user.args[1] == 1
-                    user.replace_all_uses_with(new_node)
-
-                new_node.meta["val"] = node.meta["val"][1]
-            elif len(schema.returns) > 1:
-                # If the function has more than 1 return then since we got rid of
-                # the 1st return value (the token), we need to bump all the other
-                # getitem calls by 1 down
-                for user in list(new_node.users.keys()):
-                    assert user.args[1] >= 1
-                    user.args = (user.args[0], user.args[1] - 1)
-
-                new_node.meta["val"] = node.meta["val"][1:]
-            else:
-                assert len(schema.returns) == 0
-                assert len(new_node.users) == 0
-                new_node.meta["val"] = None
-
-        ep.graph.erase_node(node)
-
-    # Remove tokens from inputs
-    placeholders = [node for node in ep.graph.nodes if node.op == "placeholder"]
-    assert len(placeholders) >= num_tokens
-    inp_token_nodes = placeholders[:num_tokens]
-    for inp_token in inp_token_nodes:
-        assert inp_token.name in input_token_names
-        ep.graph.erase_node(inp_token)
-
-    ep.graph.eliminate_dead_code()
+def _get_custom_obj_for_node(node, inputs_to_lifted_custom_objs, constants):
+    """Extract the custom object from a node's arguments."""
+    custom_obj_node = node
+    custom_obj_meta = custom_obj_node.meta["val"]  # type: ignore[union-attr]
+    assert isinstance(custom_obj_meta, CustomObjArgument)
+
+    if custom_obj_meta.fake_val:
+        return custom_obj_meta.fake_val
+    elif custom_obj_node.name in inputs_to_lifted_custom_objs:  # type: ignore[union-attr]
+        return constants[inputs_to_lifted_custom_objs[custom_obj_node.name]]  # type: ignore[union-attr]
+    else:
+        raise RuntimeError(f"Unable to find custom obj for node {node}")
+
+
+def _replace_with_effects_node(
+    node, ep, inputs_to_lifted_custom_objs, output_tokens, input_tokens, module
+):
+    """Replace a with_effects node with the underlying function call."""
+    # Get the input nodes
+    token_node, func, *node_args = node.args
+    if token_node.op == "placeholder":
+        input_tokens.append(token_node)
+
+    assert isinstance(func, (torch._ops.OpOverload, torch._ops.HigherOrderOperator))
+
+    # Get the schema for the function
+    if func is torch.ops.higher_order.call_torchbind:
+        custom_obj = _get_custom_obj_for_node(
+            node_args[0], inputs_to_lifted_custom_objs, ep.constants
+        )
+        schema = _get_schema(func, [custom_obj] + node_args[1:])
+    else:
+        schema = _get_schema(func, node_args)
+
+    # Create the replacement node
+    with module.graph.inserting_before(node):
+        new_node = module.graph.call_function(func, tuple(node_args), node.kwargs)
+
+    # Update getitem nodes that extract outputs from with_effects
+    for user in list(node.users.keys()):
+        assert user.target is operator.getitem
+        # getitem(with_effects, 0) is the token node
+        if user.args[1] == 0:
+            for user_user in list(user.users.keys()):
+                if user_user.op == "output":
+                    output_tokens.append(user)
+
+    # Fix up the getitem nodes based on return count
+    if len(schema.returns) == 1:
+        # Single return: replace getitem(with_effects, 1) with the node itself
+        for user in list(node.users.keys()):
+            if user.args[1] == 1:
+                user.replace_all_uses_with(new_node)
+        new_node.meta["val"] = node.meta["val"][1]
+    elif len(schema.returns) > 1:
+        # Multiple returns: shift getitem indices down by 1
+        for user in list(node.users.keys()):
+            if user.args[1] >= 1:
+                user.args = (new_node, user.args[1] - 1)
+        new_node.meta["val"] = node.meta["val"][1:]
+    else:
+        # No returns
+        assert len(schema.returns) == 0
+        assert len(new_node.users) == 0
+        new_node.meta["val"] = None
+
+    # Copy metadata from old node to new node
+    for k, v in node.meta.items():
+        new_node.meta[k] = v
+        if k == "unbacked_bindings":
+            # Remove the extra layer for effect token
+            old_bindings = new_node.meta[k]
+            new_bindings = {
+                k: path[1:] if path else path for k, path in old_bindings.items()
+            }
+            new_node.meta[k] = new_bindings
+
+
+def _replace_invoke_subgraph_node(node, module, output_tokens, input_tokens):
+    """Replace an invoke_subgraph node to remove the token argument."""
+    assert node.args[0].op == "get_attr"
+    submod = getattr(module, node.args[0].target)
+    if not submod.meta.get("has_with_effects", False):
+        return
+
+    # Remove token from inputs
+    subgraph, identifier, token, *operands = node.args
+    node.args = (subgraph, identifier, *operands)
+    if token.op == "placeholder":
+        input_tokens.append(token)
+
+    # Update getitem nodes to account for removed token output
+    for user in list(node.users.keys()):
+        if user.args[1] >= 1:
+            user.args = (node, user.args[1] - 1)
+        elif user.args[1] == 0:
+            for user_user in list(user.users.keys()):
+                if user_user.op == "output":
+                    output_tokens.append(user)


 def _remove_effect_tokens(ep: ExportedProgram) -> ExportedProgram:
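The index arithmetic in `_replace_with_effects_node` exists because dropping the leading token shifts every remaining output down by one. In plain Python terms:

```python
with_effects_result = ("token", "out0", "out1")
getitem_indices = [1, 2]                    # users of the effectful op
plain_result = with_effects_result[1:]      # token removed
shifted = [i - 1 for i in getitem_indices]  # bump every index down by one
assert [plain_result[i] for i in shifted] == ["out0", "out1"]
```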
@@ -132,6 +124,65 @@ def _remove_effect_tokens(ep: ExportedProgram) -> ExportedProgram:
 
     This function does an inplace modification on the given ExportedProgram.
     """
+    print("before", ep)
+    inputs_to_lifted_custom_objs = ep.graph_signature.inputs_to_lifted_custom_objs
+
+    # mark submodules with effects as having effects. This will be used in the following pass to remove effects from subgraphs
+    for _, module in ep.graph_module.named_modules():
+        if not isinstance(module, torch.fx.GraphModule):
+            continue
+
+        with_effect_nodes = [
+            node for node in module.graph.nodes if node.target is with_effects
+        ]
+        if len(with_effect_nodes) > 0:
+            module.meta["has_with_effects"] = True
+
+    # Process each module with the replace hook to ensure graph signature is updated
+    with ep.graph_module._set_replace_hook(ep.graph_signature.get_replace_hook()):
+        for _, module in ep.graph_module.named_modules():
+            if not isinstance(module, torch.fx.GraphModule):
+                continue
+
+            input_tokens = []
+            output_tokens = []
+
+            # Process with_effects and invoke_subgraph nodes
+            for node in module.graph.nodes:
+                if node.target is with_effects:
+                    _replace_with_effects_node(
+                        node,
+                        ep,
+                        inputs_to_lifted_custom_objs,
+                        output_tokens,
+                        input_tokens,
+                        module,
+                    )
+                elif node.target is torch.ops.higher_order.invoke_subgraph:
+                    _replace_invoke_subgraph_node(
+                        node, module, output_tokens, input_tokens
+                    )
+
+            # Remove tokens from the output node
+            if len(output_tokens) > 0:
+                output_node = next(reversed(module.graph.find_nodes(op="output")))
+                output_args = output_node.args[0]
+                assert len(output_args) >= len(output_tokens), (
+                    f"{output_args} output arguments found\n"
+                    f"{output_tokens} output tokens found\n"
+                    f"{module.graph}"
+                )
+                output_node.args = (tuple(output_args[len(output_tokens) :]),)
+
+            module.graph.eliminate_dead_code()
+
+            # Remove tokens from the input placeholders
+            for node in module.graph.nodes:
+                if node.op == "placeholder" and node in input_tokens:
+                    module.graph.erase_node(node)
+
+            module.recompile()
+
     num_tokens: int = 0
     input_token_names: list[str] = []
     new_input_specs: list[InputSpec] = []
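
The per-module cleanup at the end of this block (strip token outputs, run DCE, erase the now-dead token placeholders, recompile) is the standard fx graph-editing sequence. A minimal stand-alone version on a toy traced function rather than an ExportedProgram, with illustrative names:

    import torch.fx as fx

    def f(token, x):
        return x + 1          # token is never used, like a dead effect token

    gm = fx.symbolic_trace(f)
    for node in list(gm.graph.nodes):
        if node.op == "placeholder" and node.target == "token":
            gm.graph.erase_node(node)   # only legal because token has no users
    gm.graph.eliminate_dead_code()
    gm.recompile()
    print(gm.code)   # forward now takes only x

Note the ordering: DCE must run before erasing placeholders, since erase_node refuses to remove a node that still has users.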
@@ -159,9 +210,5 @@ def _remove_effect_tokens(ep: ExportedProgram) -> ExportedProgram:
 
     assert num_tokens == num_out_tokens
 
-    with ep.graph_module._set_replace_hook(ep.graph_signature.get_replace_hook()):
-        _remove_effect_tokens_from_graph_helper(
-            ep, num_tokens, input_token_names, output_token_names
-        )
-
+    print("after", ep)
     return ep
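
For background on what this pass consumes: exporting a program that calls a side-effectful op threads a token through a with_effects call, which is exactly what _remove_effect_tokens later strips. A rough sketch, assuming torch.ops.aten._print (an effectful op used in export's effect-token tests) is available:

    import torch
    from torch.export import export

    class M(torch.nn.Module):
        def forward(self, x):
            torch.ops.aten._print("side effect")   # effectful op, tokenized by export
            return x + 1

    ep = export(M(), (torch.randn(2),))
    # The graph is expected to contain a with_effects(token, aten._print, ...) node
    # plus a token input and output until _remove_effect_tokens runs.
    print(ep.graph)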

@@ -748,11 +748,23 @@ def _unlift_exported_program_lifted_states(
 ) -> torch.fx.GraphModule:
     check_guards = check_guards and _ok_to_generate_guards_fn()
 
+    source_node_dict = {
+        node.name: node for node in ep.graph.nodes if node.op != "placeholder"
+    }
+    # placeholder node name might change after deepcopy
+    placeholder_source_node_dict = {
+        node.target: node for node in ep.graph.nodes if node.op == "placeholder"
+    }
+
+    new_gm = torch.fx.GraphModule(ep.graph_module, copy.deepcopy(ep.graph))
+    new_gm.meta.update(ep.graph_module.meta)
+    ep = copy.copy(ep)
+    ep._graph_module = new_gm
+
     # TODO T206340015
     if ep.verifiers[0].dialect != "TRAINING":
         ep = _remove_effect_tokens(ep)
 
-    new_gm = torch.fx.GraphModule(ep.graph_module, copy.deepcopy(ep.graph))
     _register_attrs_to_new_gm(new_gm, ep.graph_signature, ep.state_dict, ep.constants)
     forward_arg_names = (
         sig.forward_arg_names if (sig := ep.module_call_graph[0].signature) else None
@@ -786,19 +798,13 @@ def _unlift_exported_program_lifted_states(
         for out_spec in ep.graph_signature.output_specs
     ]
 
-    source_node_dict = {
-        node.name: node for node in ep.graph.nodes if node.op != "placeholder"
-    }
-    # placeholder node name might change after deepcopy
-    placeholder_source_node_dict = {
-        node.target: node for node in ep.graph.nodes if node.op == "placeholder"
-    }
-
     for node in new_gm.graph.nodes:
         source_node = None
         if node.op == "placeholder":
             source_node = placeholder_source_node_dict.get(node.target)
         else:
-            source_node = source_node_dict.get(node.name)
+            if node.name in source_node_dict:
+                source_node = source_node_dict.get(node.name)
         node.meta["from_node"] = [
             NodeSource(
                 source_node,
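
The point of hoisting the two lookup dicts above the deepcopy, and of keying placeholders by .target rather than .name, can be seen on a toy graph. This is a sketch of the invariant being relied on, not code from the PR:

    import copy
    import torch.fx as fx

    def f(x):
        return x * 2

    gm = fx.symbolic_trace(f)
    # Key placeholders by .target: the target (the original argument name)
    # survives copies even when the node's .name is later renamed.
    placeholders = {n.target: n for n in gm.graph.nodes if n.op == "placeholder"}
    others = {n.name: n for n in gm.graph.nodes if n.op != "placeholder"}

    graph_copy = copy.deepcopy(gm.graph)
    for n in graph_copy.nodes:
        src = placeholders.get(n.target) if n.op == "placeholder" else others.get(n.name)
        assert src is not None   # every copied node maps back to its source node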

@@ -753,7 +753,9 @@ class Node(_NodeBase):
                 # between eager and compiled execution, regardless of generator usage
                 return True
 
-            return self.target in _side_effectful_functions
+            from torch._higher_order_ops.effects import has_effects
+
+            return self.target in _side_effectful_functions or has_effects(self.target)
 
         # Check if an impure module.
         if self.op == "call_module":
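
The practical effect of this change: fx dead-code elimination keeps any node whose is_impure() returns True, so targets with registered effects now survive DCE even when their outputs are unused, the same way members of _side_effectful_functions already do. A small illustration using torch._assert, which is already in that set:

    import torch
    import torch.fx as fx

    g = fx.Graph()
    x = g.placeholder("x")
    g.call_function(torch.add, (x, 1))            # pure and unused: removed by DCE
    g.call_function(torch._assert, (True, "ok"))  # impure: kept despite no users
    g.output(x)

    g.eliminate_dead_code()
    assert any(n.target is torch._assert for n in g.nodes)
    assert all(n.target is not torch.add for n in g.nodes)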

@@ -39,6 +39,10 @@ def manual_seed(seed) -> torch._C.Generator:
     is raised. Negative inputs are remapped to positive values with the formula
     `0xffff_ffff_ffff_ffff + seed`.
     """
+    return _manual_seed_impl(seed, update_local_tensor_states=True)
+
+
+def _manual_seed_impl(seed, update_local_tensor_states) -> torch._C.Generator:
     seed = int(seed)
     import torch.cuda
 
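
The public behavior of manual_seed is unchanged by this split: it still seeds every device RNG, and the new update_local_tensor_states flag presumably lets internal callers skip the LocalTensor bookkeeping. Reseeding still reproduces the same stream:

    import torch

    torch.manual_seed(42)
    a = torch.rand(3)
    torch.manual_seed(42)
    b = torch.rand(3)
    assert torch.equal(a, b)   # same seed, same sequence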

@@ -724,6 +724,9 @@ class LocalDTensorTestBase(DTensorTestBase):
         torch.autograd._enable_record_function(False)
 
     def tearDown(self) -> None:
+        from torch.distributed.tensor import _random as random
+
+        random._rng_tracker = None
         super().tearDown()
         torch.autograd._enable_record_function(True)
 
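
The tearDown addition is the usual reset-global-state pattern for test isolation: the distributed RNG tracker is module-level state, so clearing it keeps one test's tracker from leaking into the next. Schematically, in a minimal hypothetical TestCase rather than the PR's base class:

    import unittest

    import torch.distributed.tensor._random as random  # assumes distributed support

    class IsolatedTest(unittest.TestCase):
        def tearDown(self) -> None:
            random._rng_tracker = None   # drop any tracker a test installed
            super().tearDown()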