Mirror of https://github.com/pytorch/pytorch.git
Synced 2025-11-12 06:44:55 +08:00
Compare commits
28 Commits
v2.9.0
...
cherry-pick
| SHA1 |
|---|
| bfce8dd73a |
| 9976b77abb |
| e6bcbbe17c |
| 8f658d7599 |
| 3d27d955fd |
| a06141f73d |
| 5b9f040d0e |
| 49046e0e4f |
| 4aca6a7110 |
| 6bc3d6fcd6 |
| ba8639586b |
| f190bda17a |
| 8e83e24d7f |
| 13f1b551b0 |
| 38e8ba6ecc |
| 76335d8125 |
| fc5612a499 |
| d29deefa9e |
| 593377555e |
| e0c8ff1b8a |
| 3dead93453 |
| e2f6f8c079 |
| 32e37e6b9d |
| cbe1a35dbd |
| 9315f44cd6 |
| e9e3db62fe |
| c19082674b |
| 4dca449358 |
@@ -8,6 +8,8 @@ if [[ "$GPU_ARCH_VERSION" == *"12.6"* ]]; then
     export TORCH_CUDA_ARCH_LIST="8.0;9.0"
 elif [[ "$GPU_ARCH_VERSION" == *"12.8"* ]]; then
     export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;12.0"
+elif [[ "$GPU_ARCH_VERSION" == *"12.9"* ]]; then
+    export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;12.0"
 elif [[ "$GPU_ARCH_VERSION" == *"13.0"* ]]; then
     export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;11.0;12.0+PTX"
 fi
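The same version-to-architecture mapping can be sketched in Python for quick reference. The helper below is illustrative only (its name and structure are not part of the CI scripts); it simply mirrors the shell branches above, including CUDA 12.9 reusing the 12.8 architecture list.

```python
# Hypothetical helper mirroring the shell logic above; not part of the CI scripts.
CUDA_TO_ARCH_LIST = {
    "12.6": "8.0;9.0",
    "12.8": "8.0;9.0;10.0;12.0",
    "12.9": "8.0;9.0;10.0;12.0",           # 12.9 reuses the 12.8 arch list
    "13.0": "8.0;9.0;10.0;11.0;12.0+PTX",  # 13.0 adds 11.0 and 12.0+PTX
}

def torch_cuda_arch_list(gpu_arch_version: str) -> str:
    """Return TORCH_CUDA_ARCH_LIST for a version string such as '12.9' or '12.9-aarch64'."""
    for version, arch_list in CUDA_TO_ARCH_LIST.items():
        if version in gpu_arch_version:
            return arch_list
    raise ValueError(f"unsupported GPU_ARCH_VERSION: {gpu_arch_version}")

assert torch_cuda_arch_list("12.9") == "8.0;9.0;10.0;12.0"
```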
@@ -1 +1 @@
-bbb06c0334a6772b92d24bde54956e675c8c6604
+bfeb066872bc1e8b2d2bc0a3b295b99dd77206e7
@@ -1 +1 @@
-3.5.0
+3.5.1
@@ -187,19 +187,22 @@ if [[ $CUDA_VERSION == 12* || $CUDA_VERSION == 13* ]]; then
         export USE_CUFILE=0
     else
         DEPS_LIST+=(
-            "/usr/local/cuda/lib64/libnvToolsExt.so.1"
             "/usr/local/cuda/lib64/libcublas.so.12"
             "/usr/local/cuda/lib64/libcublasLt.so.12"
             "/usr/local/cuda/lib64/libcudart.so.12"
             "/usr/local/cuda/lib64/libnvrtc.so.12"
             "/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.12")
         DEPS_SONAME+=(
-            "libnvToolsExt.so.1"
             "libcublas.so.12"
             "libcublasLt.so.12"
             "libcudart.so.12"
             "libnvrtc.so.12"
             "libcupti.so.12")
+
+        if [[ $CUDA_VERSION != 12.9* ]]; then
+            DEPS_LIST+=("/usr/local/cuda/lib64/libnvToolsExt.so.1")
+            DEPS_SONAME+=("libnvToolsExt.so.1")
+        fi
     fi
 else
     echo "Using nvidia libs from pypi."
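A rough Python sketch of the packaging rule above, for reasoning about it outside the shell script. The function name and signature are hypothetical; the CUDA 12.9 special case simply mirrors the conditional in the diff, which stops bundling libnvToolsExt.so.1 for that toolkit.

```python
# Illustrative sketch only: mirrors the bundled-library logic from the diff above.
def cuda_deps(cuda_version: str, cuda_home: str = "/usr/local/cuda"):
    """Return (paths, sonames) of CUDA libraries to bundle (hypothetical helper)."""
    sonames = ["libcublas.so.12", "libcublasLt.so.12", "libcudart.so.12", "libnvrtc.so.12"]
    paths = [f"{cuda_home}/lib64/{name}" for name in sonames]
    paths.append(f"{cuda_home}/extras/CUPTI/lib64/libcupti.so.12")
    sonames.append("libcupti.so.12")
    # Per the diff, libnvToolsExt.so.1 is only bundled when the toolkit is not 12.9.
    if not cuda_version.startswith("12.9"):
        paths.append(f"{cuda_home}/lib64/libnvToolsExt.so.1")
        sonames.append("libnvToolsExt.so.1")
    return paths, sonames
```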
.github/scripts/generate_binary_build_matrix.py (vendored, 38 changes)
@@ -16,16 +16,18 @@ from typing import Optional


 # NOTE: Please also update the CUDA sources in `PIP_SOURCES` in tools/nightly.py when changing this
-CUDA_ARCHES = ["12.6", "12.8", "13.0"]
+CUDA_ARCHES = ["12.6", "12.8", "12.9", "13.0"]
 CUDA_STABLE = "12.8"
 CUDA_ARCHES_FULL_VERSION = {
     "12.6": "12.6.3",
     "12.8": "12.8.1",
+    "12.9": "12.9.1",
     "13.0": "13.0.0",
 }
 CUDA_ARCHES_CUDNN_VERSION = {
     "12.6": "9",
     "12.8": "9",
+    "12.9": "9",
     "13.0": "9",
 }
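A small consistency check over the tables above could look like the following. This is a hedged sketch, not an existing function in generate_binary_build_matrix.py; it only restates the invariant that every CUDA_ARCHES entry needs a matching full version and cuDNN version.

```python
# Hedged sketch: a consistency check one could run over the dictionaries above.
def check_cuda_arch_tables(cuda_arches, full_version, cudnn_version):
    for arch in cuda_arches:
        assert arch in full_version, f"missing full version for CUDA {arch}"
        assert arch in cudnn_version, f"missing cuDNN version for CUDA {arch}"

check_cuda_arch_tables(
    ["12.6", "12.8", "12.9", "13.0"],
    {"12.6": "12.6.3", "12.8": "12.8.1", "12.9": "12.9.1", "13.0": "13.0.0"},
    {"12.6": "9", "12.8": "9", "12.9": "9", "13.0": "9"},
)
```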
@@ -38,7 +40,7 @@ CPU_AARCH64_ARCH = ["cpu-aarch64"]

 CPU_S390X_ARCH = ["cpu-s390x"]

-CUDA_AARCH64_ARCHES = ["12.6-aarch64", "12.8-aarch64", "13.0-aarch64"]
+CUDA_AARCH64_ARCHES = ["12.6-aarch64", "12.8-aarch64", "12.9-aarch64", "13.0-aarch64"]


 PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
@@ -76,6 +78,23 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
         "nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | "
         "nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'"
     ),
+    "12.9": (
+        "nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | "
+        "nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | "
+        "nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | "
+        "nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | "
+        "nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | "
+        "nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | "
+        "nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | "
+        "nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | "
+        "nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | "
+        "nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | "
+        "nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | "
+        "nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | "
+        "nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | "
+        "nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | "
+        "nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'"
+    ),
     "13.0": (
         "nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | "
         "nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | "
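Each value in PYTORCH_EXTRA_INSTALL_REQUIREMENTS is one long string of PEP 508 requirements joined with " | ". A minimal sketch of how such a string splits into individual specifiers, using an abbreviated subset of the 12.9 entry above:

```python
# Illustrative only: split the " | "-separated requirement string into specifiers.
extra = (
    "nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | "
    "nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | "
    "nvidia-nccl-cu12==2.27.5; platform_system == 'Linux'"
)  # abbreviated subset of the CUDA 12.9 entry above

requirements = [spec.strip() for spec in extra.split("|")]
for spec in requirements:
    name_and_version, _, marker = spec.partition(";")
    print(name_and_version.strip(), "->", marker.strip())
```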
@@ -222,7 +241,11 @@ def generate_libtorch_matrix(
         arches += CUDA_ARCHES
         arches += ROCM_ARCHES
     elif os == "windows":
-        arches += CUDA_ARCHES
+        # TODO (huydhn): Only build CUDA 12.9 for Linux. This logic is to be cleaned up
+        # in 2.10
+        windows_cuda_arches = CUDA_ARCHES.copy()
+        windows_cuda_arches.remove("12.9")
+        arches += windows_cuda_arches
     if libtorch_variants is None:
         libtorch_variants = [
             "shared-with-deps",
@@ -286,7 +309,11 @@ def generate_wheels_matrix(
     if os == "linux":
         arches += CUDA_ARCHES + ROCM_ARCHES + XPU_ARCHES
     elif os == "windows":
-        arches += CUDA_ARCHES + XPU_ARCHES
+        # TODO (huydhn): Only build CUDA 12.9 for Linux. This logic is to be cleaned up
+        # in 2.10
+        windows_cuda_arches = CUDA_ARCHES.copy()
+        windows_cuda_arches.remove("12.9")
+        arches += windows_cuda_arches + XPU_ARCHES
     elif os == "linux-aarch64":
         # Separate new if as the CPU type is different and
         # uses different build/test scripts
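The .copy() in the new Windows branch matters: removing "12.9" from the shared module-level list in place would also change CUDA_ARCHES for every later caller. A minimal sketch of that behaviour:

```python
# Why the .copy() above matters (illustrative, standalone example).
CUDA_ARCHES = ["12.6", "12.8", "12.9", "13.0"]

windows_cuda_arches = CUDA_ARCHES.copy()
windows_cuda_arches.remove("12.9")

assert windows_cuda_arches == ["12.6", "12.8", "13.0"]
assert CUDA_ARCHES == ["12.6", "12.8", "12.9", "13.0"]  # original list untouched
```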
@@ -322,7 +349,7 @@
             # cuda linux wheels require PYTORCH_EXTRA_INSTALL_REQUIREMENTS to install

             if (
-                arch_version in ["13.0", "12.8", "12.6"]
+                arch_version in ["13.0", "12.9", "12.8", "12.6"]
                 and os == "linux"
                 or arch_version in CUDA_AARCH64_ARCHES
             ):
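One subtlety in the condition above: Python's `and` binds tighter than `or`, so the expression groups as (version match and linux) or (aarch64 CUDA arch). The wrapper function below is hypothetical, added only to make that grouping explicit:

```python
# Note on precedence: "and" binds tighter than "or" in the expression from the diff.
CUDA_AARCH64_ARCHES = ["12.6-aarch64", "12.8-aarch64", "12.9-aarch64", "13.0-aarch64"]

def needs_extra_install_requirements(arch_version: str, os: str) -> bool:
    # Hypothetical wrapper around the expression shown above, for illustration.
    return (
        arch_version in ["13.0", "12.9", "12.8", "12.6"]
        and os == "linux"
        or arch_version in CUDA_AARCH64_ARCHES
    )

assert needs_extra_install_requirements("12.9", "linux")
assert needs_extra_install_requirements("12.9-aarch64", "linux-aarch64")
assert not needs_extra_install_requirements("12.9", "windows")
```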
@@ -386,5 +413,6 @@


 validate_nccl_dep_consistency("13.0")
+validate_nccl_dep_consistency("12.9")
 validate_nccl_dep_consistency("12.8")
 validate_nccl_dep_consistency("12.6")
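The diff only adds a call for 12.9; the body of validate_nccl_dep_consistency is not shown here. Below is a hedged sketch of the kind of check such a function could perform, assuming a pinned NCCL version is supplied from elsewhere in the repo; the real implementation in generate_binary_build_matrix.py may differ.

```python
import re

# Hedged sketch only; the real validate_nccl_dep_consistency may check a
# different source of truth and have a different signature.
def validate_nccl_dep_consistency_sketch(arch_version, extra_requirements, pinned_nccl_version):
    match = re.search(r"nvidia-nccl-cu\d+==([\d.]+)", extra_requirements)
    if not match:
        raise RuntimeError(f"no NCCL pin found for CUDA {arch_version}")
    if match.group(1) != pinned_nccl_version:
        raise RuntimeError(
            f"NCCL pin mismatch for CUDA {arch_version}: "
            f"{match.group(1)} != {pinned_nccl_version}"
        )

validate_nccl_dep_consistency_sketch(
    "12.9",
    "nvidia-nccl-cu12==2.27.5; platform_system == 'Linux'",
    "2.27.5",
)
```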
.github/workflows/build-manywheel-images.yml (vendored, 4 changes)
@@ -46,10 +46,12 @@ jobs:
       fail-fast: false
       matrix:
         include: [
           { name: "manylinux2_28-builder", tag: "cuda13.0", runner: "linux.9xlarge.ephemeral" },
           { name: "manylinux2_28-builder", tag: "cuda12.8", runner: "linux.9xlarge.ephemeral" },
+          { name: "manylinux2_28-builder", tag: "cuda12.9", runner: "linux.9xlarge.ephemeral" },
           { name: "manylinux2_28-builder", tag: "cuda12.6", runner: "linux.9xlarge.ephemeral" },
           { name: "manylinuxaarch64-builder", tag: "cuda13.0", runner: "linux.arm64.2xlarge.ephemeral" },
+          { name: "manylinuxaarch64-builder", tag: "cuda12.9", runner: "linux.arm64.2xlarge.ephemeral" },
           { name: "manylinuxaarch64-builder", tag: "cuda12.8", runner: "linux.arm64.2xlarge.ephemeral" },
           { name: "manylinuxaarch64-builder", tag: "cuda12.6", runner: "linux.arm64.2xlarge.ephemeral" },
           { name: "manylinux2_28-builder", tag: "rocm6.3", runner: "linux.9xlarge.ephemeral" },
.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml (generated, vendored, 322 changes)
@ -204,6 +204,52 @@ jobs:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
uses: ./.github/workflows/_binary-upload.yml
|
||||
|
||||
manywheel-py3_10-cuda-aarch64-12_9-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
uses: ./.github/workflows/_binary-build-linux.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu129
|
||||
GPU_ARCH_VERSION: "12.9-aarch64"
|
||||
GPU_ARCH_TYPE: cuda-aarch64
|
||||
DOCKER_IMAGE: manylinuxaarch64-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
|
||||
DESIRED_PYTHON: "3.10"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.arm64.m7g.4xlarge.ephemeral
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_10-cuda-aarch64-12_9
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_10-cuda-aarch64-12_9-upload: # Uploading
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
needs: manywheel-py3_10-cuda-aarch64-12_9-build
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu129
|
||||
GPU_ARCH_VERSION: "12.9-aarch64"
|
||||
GPU_ARCH_TYPE: cuda-aarch64
|
||||
DOCKER_IMAGE: manylinuxaarch64-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
|
||||
DESIRED_PYTHON: "3.10"
|
||||
build_name: manywheel-py3_10-cuda-aarch64-12_9
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
uses: ./.github/workflows/_binary-upload.yml
|
||||
|
||||
manywheel-py3_10-cuda-aarch64-13_0-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
uses: ./.github/workflows/_binary-build-linux.yml
|
||||
@ -407,6 +453,52 @@ jobs:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
uses: ./.github/workflows/_binary-upload.yml
|
||||
|
||||
manywheel-py3_11-cuda-aarch64-12_9-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
uses: ./.github/workflows/_binary-build-linux.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu129
|
||||
GPU_ARCH_VERSION: "12.9-aarch64"
|
||||
GPU_ARCH_TYPE: cuda-aarch64
|
||||
DOCKER_IMAGE: manylinuxaarch64-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
|
||||
DESIRED_PYTHON: "3.11"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.arm64.m7g.4xlarge.ephemeral
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_11-cuda-aarch64-12_9
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_11-cuda-aarch64-12_9-upload: # Uploading
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
needs: manywheel-py3_11-cuda-aarch64-12_9-build
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu129
|
||||
GPU_ARCH_VERSION: "12.9-aarch64"
|
||||
GPU_ARCH_TYPE: cuda-aarch64
|
||||
DOCKER_IMAGE: manylinuxaarch64-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
|
||||
DESIRED_PYTHON: "3.11"
|
||||
build_name: manywheel-py3_11-cuda-aarch64-12_9
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
uses: ./.github/workflows/_binary-upload.yml
|
||||
|
||||
manywheel-py3_11-cuda-aarch64-13_0-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
uses: ./.github/workflows/_binary-build-linux.yml
|
||||
@ -610,6 +702,52 @@ jobs:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
uses: ./.github/workflows/_binary-upload.yml
|
||||
|
||||
manywheel-py3_12-cuda-aarch64-12_9-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
uses: ./.github/workflows/_binary-build-linux.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu129
|
||||
GPU_ARCH_VERSION: "12.9-aarch64"
|
||||
GPU_ARCH_TYPE: cuda-aarch64
|
||||
DOCKER_IMAGE: manylinuxaarch64-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
|
||||
DESIRED_PYTHON: "3.12"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.arm64.m7g.4xlarge.ephemeral
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_12-cuda-aarch64-12_9
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_12-cuda-aarch64-12_9-upload: # Uploading
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
needs: manywheel-py3_12-cuda-aarch64-12_9-build
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu129
|
||||
GPU_ARCH_VERSION: "12.9-aarch64"
|
||||
GPU_ARCH_TYPE: cuda-aarch64
|
||||
DOCKER_IMAGE: manylinuxaarch64-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
|
||||
DESIRED_PYTHON: "3.12"
|
||||
build_name: manywheel-py3_12-cuda-aarch64-12_9
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
uses: ./.github/workflows/_binary-upload.yml
|
||||
|
||||
manywheel-py3_12-cuda-aarch64-13_0-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
uses: ./.github/workflows/_binary-build-linux.yml
|
||||
@ -813,6 +951,52 @@ jobs:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
uses: ./.github/workflows/_binary-upload.yml
|
||||
|
||||
manywheel-py3_13-cuda-aarch64-12_9-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
uses: ./.github/workflows/_binary-build-linux.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu129
|
||||
GPU_ARCH_VERSION: "12.9-aarch64"
|
||||
GPU_ARCH_TYPE: cuda-aarch64
|
||||
DOCKER_IMAGE: manylinuxaarch64-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
|
||||
DESIRED_PYTHON: "3.13"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.arm64.m7g.4xlarge.ephemeral
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_13-cuda-aarch64-12_9
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_13-cuda-aarch64-12_9-upload: # Uploading
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
needs: manywheel-py3_13-cuda-aarch64-12_9-build
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu129
|
||||
GPU_ARCH_VERSION: "12.9-aarch64"
|
||||
GPU_ARCH_TYPE: cuda-aarch64
|
||||
DOCKER_IMAGE: manylinuxaarch64-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
|
||||
DESIRED_PYTHON: "3.13"
|
||||
build_name: manywheel-py3_13-cuda-aarch64-12_9
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
uses: ./.github/workflows/_binary-upload.yml
|
||||
|
||||
manywheel-py3_13-cuda-aarch64-13_0-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
uses: ./.github/workflows/_binary-build-linux.yml
|
||||
@ -1016,6 +1200,52 @@ jobs:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
uses: ./.github/workflows/_binary-upload.yml
|
||||
|
||||
manywheel-py3_13t-cuda-aarch64-12_9-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
uses: ./.github/workflows/_binary-build-linux.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu129
|
||||
GPU_ARCH_VERSION: "12.9-aarch64"
|
||||
GPU_ARCH_TYPE: cuda-aarch64
|
||||
DOCKER_IMAGE: manylinuxaarch64-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
|
||||
DESIRED_PYTHON: "3.13t"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.arm64.m7g.4xlarge.ephemeral
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_13t-cuda-aarch64-12_9
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_13t-cuda-aarch64-12_9-upload: # Uploading
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
needs: manywheel-py3_13t-cuda-aarch64-12_9-build
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu129
|
||||
GPU_ARCH_VERSION: "12.9-aarch64"
|
||||
GPU_ARCH_TYPE: cuda-aarch64
|
||||
DOCKER_IMAGE: manylinuxaarch64-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
|
||||
DESIRED_PYTHON: "3.13t"
|
||||
build_name: manywheel-py3_13t-cuda-aarch64-12_9
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
uses: ./.github/workflows/_binary-upload.yml
|
||||
|
||||
manywheel-py3_13t-cuda-aarch64-13_0-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
uses: ./.github/workflows/_binary-build-linux.yml
|
||||
@ -1219,6 +1449,52 @@ jobs:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
uses: ./.github/workflows/_binary-upload.yml
|
||||
|
||||
manywheel-py3_14-cuda-aarch64-12_9-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
uses: ./.github/workflows/_binary-build-linux.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu129
|
||||
GPU_ARCH_VERSION: "12.9-aarch64"
|
||||
GPU_ARCH_TYPE: cuda-aarch64
|
||||
DOCKER_IMAGE: manylinuxaarch64-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
|
||||
DESIRED_PYTHON: "3.14"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.arm64.m7g.4xlarge.ephemeral
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_14-cuda-aarch64-12_9
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_14-cuda-aarch64-12_9-upload: # Uploading
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
needs: manywheel-py3_14-cuda-aarch64-12_9-build
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu129
|
||||
GPU_ARCH_VERSION: "12.9-aarch64"
|
||||
GPU_ARCH_TYPE: cuda-aarch64
|
||||
DOCKER_IMAGE: manylinuxaarch64-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
|
||||
DESIRED_PYTHON: "3.14"
|
||||
build_name: manywheel-py3_14-cuda-aarch64-12_9
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
uses: ./.github/workflows/_binary-upload.yml
|
||||
|
||||
manywheel-py3_14-cuda-aarch64-13_0-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
uses: ./.github/workflows/_binary-build-linux.yml
|
||||
@ -1422,6 +1698,52 @@ jobs:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
uses: ./.github/workflows/_binary-upload.yml
|
||||
|
||||
manywheel-py3_14t-cuda-aarch64-12_9-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
uses: ./.github/workflows/_binary-build-linux.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu129
|
||||
GPU_ARCH_VERSION: "12.9-aarch64"
|
||||
GPU_ARCH_TYPE: cuda-aarch64
|
||||
DOCKER_IMAGE: manylinuxaarch64-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
|
||||
DESIRED_PYTHON: "3.14t"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.arm64.m7g.4xlarge.ephemeral
|
||||
ALPINE_IMAGE: "arm64v8/alpine"
|
||||
build_name: manywheel-py3_14t-cuda-aarch64-12_9
|
||||
build_environment: linux-aarch64-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
|
||||
timeout-minutes: 420
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_14t-cuda-aarch64-12_9-upload: # Uploading
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
needs: manywheel-py3_14t-cuda-aarch64-12_9-build
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu129
|
||||
GPU_ARCH_VERSION: "12.9-aarch64"
|
||||
GPU_ARCH_TYPE: cuda-aarch64
|
||||
DOCKER_IMAGE: manylinuxaarch64-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
|
||||
DESIRED_PYTHON: "3.14t"
|
||||
build_name: manywheel-py3_14t-cuda-aarch64-12_9
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
uses: ./.github/workflows/_binary-upload.yml
|
||||
|
||||
manywheel-py3_14t-cuda-aarch64-13_0-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
uses: ./.github/workflows/_binary-build-linux.yml
|
||||
|
||||
.github/workflows/generated-linux-binary-libtorch-nightly.yml (generated, vendored, 68 changes)
@ -248,6 +248,74 @@ jobs:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
uses: ./.github/workflows/_binary-upload.yml
|
||||
|
||||
libtorch-cuda12_9-shared-with-deps-release-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
uses: ./.github/workflows/_binary-build-linux.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
PACKAGE_TYPE: libtorch
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu129
|
||||
GPU_ARCH_VERSION: "12.9"
|
||||
GPU_ARCH_TYPE: cuda
|
||||
DOCKER_IMAGE: libtorch-cxx11-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
|
||||
LIBTORCH_CONFIG: release
|
||||
LIBTORCH_VARIANT: shared-with-deps
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: libtorch-cuda12_9-shared-with-deps-release
|
||||
build_environment: linux-binary-libtorch
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
libtorch-cuda12_9-shared-with-deps-release-test: # Testing
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
needs:
|
||||
- libtorch-cuda12_9-shared-with-deps-release-build
|
||||
- get-label-type
|
||||
uses: ./.github/workflows/_binary-test-linux.yml
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
PACKAGE_TYPE: libtorch
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu129
|
||||
GPU_ARCH_VERSION: "12.9"
|
||||
GPU_ARCH_TYPE: cuda
|
||||
DOCKER_IMAGE: libtorch-cxx11-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
|
||||
LIBTORCH_CONFIG: release
|
||||
LIBTORCH_VARIANT: shared-with-deps
|
||||
build_name: libtorch-cuda12_9-shared-with-deps-release
|
||||
build_environment: linux-binary-libtorch
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
libtorch-cuda12_9-shared-with-deps-release-upload: # Uploading
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
needs: libtorch-cuda12_9-shared-with-deps-release-test
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
PACKAGE_TYPE: libtorch
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu129
|
||||
GPU_ARCH_VERSION: "12.9"
|
||||
GPU_ARCH_TYPE: cuda
|
||||
DOCKER_IMAGE: libtorch-cxx11-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
|
||||
LIBTORCH_CONFIG: release
|
||||
LIBTORCH_VARIANT: shared-with-deps
|
||||
build_name: libtorch-cuda12_9-shared-with-deps-release
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
uses: ./.github/workflows/_binary-upload.yml
|
||||
|
||||
libtorch-cuda13_0-shared-with-deps-release-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
uses: ./.github/workflows/_binary-build-linux.yml
|
||||
|
||||
.github/workflows/generated-linux-binary-manywheel-nightly.yml (generated, vendored, 462 changes)
@ -241,6 +241,72 @@ jobs:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
uses: ./.github/workflows/_binary-upload.yml
|
||||
|
||||
manywheel-py3_10-cuda12_9-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
uses: ./.github/workflows/_binary-build-linux.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu129
|
||||
GPU_ARCH_VERSION: "12.9"
|
||||
GPU_ARCH_TYPE: cuda
|
||||
DOCKER_IMAGE: manylinux2_28-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
|
||||
DESIRED_PYTHON: "3.10"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_10-cuda12_9
|
||||
build_environment: linux-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_10-cuda12_9-test: # Testing
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
needs:
|
||||
- manywheel-py3_10-cuda12_9-build
|
||||
- get-label-type
|
||||
uses: ./.github/workflows/_binary-test-linux.yml
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu129
|
||||
GPU_ARCH_VERSION: "12.9"
|
||||
GPU_ARCH_TYPE: cuda
|
||||
DOCKER_IMAGE: manylinux2_28-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
|
||||
DESIRED_PYTHON: "3.10"
|
||||
build_name: manywheel-py3_10-cuda12_9
|
||||
build_environment: linux-binary-manywheel
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_10-cuda12_9-upload: # Uploading
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
needs: manywheel-py3_10-cuda12_9-test
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu129
|
||||
GPU_ARCH_VERSION: "12.9"
|
||||
GPU_ARCH_TYPE: cuda
|
||||
DOCKER_IMAGE: manylinux2_28-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
|
||||
DESIRED_PYTHON: "3.10"
|
||||
build_name: manywheel-py3_10-cuda12_9
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
uses: ./.github/workflows/_binary-upload.yml
|
||||
|
||||
manywheel-py3_10-cuda13_0-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
uses: ./.github/workflows/_binary-build-linux.yml
|
||||
@ -832,6 +898,72 @@ jobs:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
uses: ./.github/workflows/_binary-upload.yml
|
||||
|
||||
manywheel-py3_11-cuda12_9-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
uses: ./.github/workflows/_binary-build-linux.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu129
|
||||
GPU_ARCH_VERSION: "12.9"
|
||||
GPU_ARCH_TYPE: cuda
|
||||
DOCKER_IMAGE: manylinux2_28-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
|
||||
DESIRED_PYTHON: "3.11"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_11-cuda12_9
|
||||
build_environment: linux-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_11-cuda12_9-test: # Testing
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
needs:
|
||||
- manywheel-py3_11-cuda12_9-build
|
||||
- get-label-type
|
||||
uses: ./.github/workflows/_binary-test-linux.yml
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu129
|
||||
GPU_ARCH_VERSION: "12.9"
|
||||
GPU_ARCH_TYPE: cuda
|
||||
DOCKER_IMAGE: manylinux2_28-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
|
||||
DESIRED_PYTHON: "3.11"
|
||||
build_name: manywheel-py3_11-cuda12_9
|
||||
build_environment: linux-binary-manywheel
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_11-cuda12_9-upload: # Uploading
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
needs: manywheel-py3_11-cuda12_9-test
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu129
|
||||
GPU_ARCH_VERSION: "12.9"
|
||||
GPU_ARCH_TYPE: cuda
|
||||
DOCKER_IMAGE: manylinux2_28-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
|
||||
DESIRED_PYTHON: "3.11"
|
||||
build_name: manywheel-py3_11-cuda12_9
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
uses: ./.github/workflows/_binary-upload.yml
|
||||
|
||||
manywheel-py3_11-cuda13_0-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
uses: ./.github/workflows/_binary-build-linux.yml
|
||||
@ -1423,6 +1555,72 @@ jobs:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
uses: ./.github/workflows/_binary-upload.yml
|
||||
|
||||
manywheel-py3_12-cuda12_9-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
uses: ./.github/workflows/_binary-build-linux.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu129
|
||||
GPU_ARCH_VERSION: "12.9"
|
||||
GPU_ARCH_TYPE: cuda
|
||||
DOCKER_IMAGE: manylinux2_28-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
|
||||
DESIRED_PYTHON: "3.12"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_12-cuda12_9
|
||||
build_environment: linux-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_12-cuda12_9-test: # Testing
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
needs:
|
||||
- manywheel-py3_12-cuda12_9-build
|
||||
- get-label-type
|
||||
uses: ./.github/workflows/_binary-test-linux.yml
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu129
|
||||
GPU_ARCH_VERSION: "12.9"
|
||||
GPU_ARCH_TYPE: cuda
|
||||
DOCKER_IMAGE: manylinux2_28-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
|
||||
DESIRED_PYTHON: "3.12"
|
||||
build_name: manywheel-py3_12-cuda12_9
|
||||
build_environment: linux-binary-manywheel
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_12-cuda12_9-upload: # Uploading
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
needs: manywheel-py3_12-cuda12_9-test
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu129
|
||||
GPU_ARCH_VERSION: "12.9"
|
||||
GPU_ARCH_TYPE: cuda
|
||||
DOCKER_IMAGE: manylinux2_28-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
|
||||
DESIRED_PYTHON: "3.12"
|
||||
build_name: manywheel-py3_12-cuda12_9
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
uses: ./.github/workflows/_binary-upload.yml
|
||||
|
||||
manywheel-py3_12-cuda13_0-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
uses: ./.github/workflows/_binary-build-linux.yml
|
||||
@ -2014,6 +2212,72 @@ jobs:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
uses: ./.github/workflows/_binary-upload.yml
|
||||
|
||||
manywheel-py3_13-cuda12_9-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
uses: ./.github/workflows/_binary-build-linux.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu129
|
||||
GPU_ARCH_VERSION: "12.9"
|
||||
GPU_ARCH_TYPE: cuda
|
||||
DOCKER_IMAGE: manylinux2_28-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
|
||||
DESIRED_PYTHON: "3.13"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_13-cuda12_9
|
||||
build_environment: linux-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_13-cuda12_9-test: # Testing
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
needs:
|
||||
- manywheel-py3_13-cuda12_9-build
|
||||
- get-label-type
|
||||
uses: ./.github/workflows/_binary-test-linux.yml
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu129
|
||||
GPU_ARCH_VERSION: "12.9"
|
||||
GPU_ARCH_TYPE: cuda
|
||||
DOCKER_IMAGE: manylinux2_28-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
|
||||
DESIRED_PYTHON: "3.13"
|
||||
build_name: manywheel-py3_13-cuda12_9
|
||||
build_environment: linux-binary-manywheel
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_13-cuda12_9-upload: # Uploading
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
needs: manywheel-py3_13-cuda12_9-test
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu129
|
||||
GPU_ARCH_VERSION: "12.9"
|
||||
GPU_ARCH_TYPE: cuda
|
||||
DOCKER_IMAGE: manylinux2_28-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
|
||||
DESIRED_PYTHON: "3.13"
|
||||
build_name: manywheel-py3_13-cuda12_9
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
uses: ./.github/workflows/_binary-upload.yml
|
||||
|
||||
manywheel-py3_13-cuda13_0-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
uses: ./.github/workflows/_binary-build-linux.yml
|
||||
@ -2605,6 +2869,72 @@ jobs:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
uses: ./.github/workflows/_binary-upload.yml
|
||||
|
||||
manywheel-py3_13t-cuda12_9-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
uses: ./.github/workflows/_binary-build-linux.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu129
|
||||
GPU_ARCH_VERSION: "12.9"
|
||||
GPU_ARCH_TYPE: cuda
|
||||
DOCKER_IMAGE: manylinux2_28-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
|
||||
DESIRED_PYTHON: "3.13t"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_13t-cuda12_9
|
||||
build_environment: linux-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_13t-cuda12_9-test: # Testing
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
needs:
|
||||
- manywheel-py3_13t-cuda12_9-build
|
||||
- get-label-type
|
||||
uses: ./.github/workflows/_binary-test-linux.yml
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu129
|
||||
GPU_ARCH_VERSION: "12.9"
|
||||
GPU_ARCH_TYPE: cuda
|
||||
DOCKER_IMAGE: manylinux2_28-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
|
||||
DESIRED_PYTHON: "3.13t"
|
||||
build_name: manywheel-py3_13t-cuda12_9
|
||||
build_environment: linux-binary-manywheel
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_13t-cuda12_9-upload: # Uploading
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
needs: manywheel-py3_13t-cuda12_9-test
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu129
|
||||
GPU_ARCH_VERSION: "12.9"
|
||||
GPU_ARCH_TYPE: cuda
|
||||
DOCKER_IMAGE: manylinux2_28-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
|
||||
DESIRED_PYTHON: "3.13t"
|
||||
build_name: manywheel-py3_13t-cuda12_9
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
uses: ./.github/workflows/_binary-upload.yml
|
||||
|
||||
manywheel-py3_13t-cuda13_0-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
uses: ./.github/workflows/_binary-build-linux.yml
|
||||
@ -3196,6 +3526,72 @@ jobs:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
uses: ./.github/workflows/_binary-upload.yml
|
||||
|
||||
manywheel-py3_14-cuda12_9-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
uses: ./.github/workflows/_binary-build-linux.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu129
|
||||
GPU_ARCH_VERSION: "12.9"
|
||||
GPU_ARCH_TYPE: cuda
|
||||
DOCKER_IMAGE: manylinux2_28-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
|
||||
DESIRED_PYTHON: "3.14"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_14-cuda12_9
|
||||
build_environment: linux-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_14-cuda12_9-test: # Testing
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
needs:
|
||||
- manywheel-py3_14-cuda12_9-build
|
||||
- get-label-type
|
||||
uses: ./.github/workflows/_binary-test-linux.yml
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu129
|
||||
GPU_ARCH_VERSION: "12.9"
|
||||
GPU_ARCH_TYPE: cuda
|
||||
DOCKER_IMAGE: manylinux2_28-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
|
||||
DESIRED_PYTHON: "3.14"
|
||||
build_name: manywheel-py3_14-cuda12_9
|
||||
build_environment: linux-binary-manywheel
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_14-cuda12_9-upload: # Uploading
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
needs: manywheel-py3_14-cuda12_9-test
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu129
|
||||
GPU_ARCH_VERSION: "12.9"
|
||||
GPU_ARCH_TYPE: cuda
|
||||
DOCKER_IMAGE: manylinux2_28-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
|
||||
DESIRED_PYTHON: "3.14"
|
||||
build_name: manywheel-py3_14-cuda12_9
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
uses: ./.github/workflows/_binary-upload.yml
|
||||
|
||||
manywheel-py3_14-cuda13_0-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
uses: ./.github/workflows/_binary-build-linux.yml
|
||||
@ -3787,6 +4183,72 @@ jobs:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
uses: ./.github/workflows/_binary-upload.yml
|
||||
|
||||
manywheel-py3_14t-cuda12_9-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
uses: ./.github/workflows/_binary-build-linux.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu129
|
||||
GPU_ARCH_VERSION: "12.9"
|
||||
GPU_ARCH_TYPE: cuda
|
||||
DOCKER_IMAGE: manylinux2_28-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
|
||||
DESIRED_PYTHON: "3.14t"
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build_name: manywheel-py3_14t-cuda12_9
|
||||
build_environment: linux-binary-manywheel
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_14t-cuda12_9-test: # Testing
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
needs:
|
||||
- manywheel-py3_14t-cuda12_9-build
|
||||
- get-label-type
|
||||
uses: ./.github/workflows/_binary-test-linux.yml
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu129
|
||||
GPU_ARCH_VERSION: "12.9"
|
||||
GPU_ARCH_TYPE: cuda
|
||||
DOCKER_IMAGE: manylinux2_28-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
|
||||
DESIRED_PYTHON: "3.14t"
|
||||
build_name: manywheel-py3_14t-cuda12_9
|
||||
build_environment: linux-binary-manywheel
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
manywheel-py3_14t-cuda12_9-upload: # Uploading
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
needs: manywheel-py3_14t-cuda12_9-test
|
||||
with:
|
||||
PYTORCH_ROOT: /pytorch
|
||||
PACKAGE_TYPE: manywheel
|
||||
# TODO: This is a legacy variable that we eventually want to get rid of in
|
||||
# favor of GPU_ARCH_VERSION
|
||||
DESIRED_CUDA: cu129
|
||||
GPU_ARCH_VERSION: "12.9"
|
||||
GPU_ARCH_TYPE: cuda
|
||||
DOCKER_IMAGE: manylinux2_28-builder
|
||||
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
|
||||
DESIRED_PYTHON: "3.14t"
|
||||
build_name: manywheel-py3_14t-cuda12_9
|
||||
secrets:
|
||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
||||
uses: ./.github/workflows/_binary-upload.yml
|
||||
|
||||
manywheel-py3_14t-cuda13_0-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
uses: ./.github/workflows/_binary-build-linux.yml
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||

|
||||

|
||||
|
||||
--------------------------------------------------------------------------------
|
||||
|
||||
@ -72,7 +72,7 @@ Elaborating Further:
|
||||
|
||||
If you use NumPy, then you have used Tensors (a.k.a. ndarray).
|
||||
|
||||

|
||||

|
||||
|
||||
PyTorch provides Tensors that can live either on the CPU or the GPU and accelerates the
|
||||
computation by a huge amount.
|
||||
@ -99,7 +99,7 @@ from several research papers on this topic, as well as current and past work suc
|
||||
While this technique is not unique to PyTorch, it's one of the fastest implementations of it to date.
|
||||
You get the best of speed and flexibility for your crazy research.
|
||||
|
||||

|
||||

|
||||
|
||||
### Python First
|
||||
|
||||
|
||||
@ -24,7 +24,6 @@ C10_DIAGNOSTIC_POP()
|
||||
namespace at {
|
||||
|
||||
namespace {
|
||||
|
||||
/*
|
||||
These const variables defined the fp32 precisions for different backend
|
||||
We have "generic", "cuda", "mkldnn" backend now and we can choose fp32
|
||||
@ -76,14 +75,6 @@ void check_fp32_prec_backend_and_op(
|
||||
return valid;
|
||||
}
|
||||
|
||||
C10_ALWAYS_INLINE void warn_deprecated_fp32_precision_api(){
|
||||
TORCH_WARN_ONCE(
|
||||
"Please use the new API settings to control TF32 behavior, such as torch.backends.cudnn.conv.fp32_precision = 'tf32' "
|
||||
"or torch.backends.cuda.matmul.fp32_precision = 'ieee'. Old settings, e.g, torch.backends.cuda.matmul.allow_tf32 = True, "
|
||||
"torch.backends.cudnn.allow_tf32 = True, allowTF32CuDNN() and allowTF32CuBLAS() will be deprecated after Pytorch 2.9. Please see "
|
||||
"https://pytorch.org/docs/main/notes/cuda.html#tensorfloat-32-tf32-on-ampere-and-later-devices"
|
||||
);
|
||||
}
|
||||
} // namespace
|
||||
|
||||
Context::Context() = default;
|
||||
@ -193,7 +184,6 @@ bool Context::allowTF32CuDNN(const std::string& op) const {
|
||||
} else {
|
||||
return float32Precision("cuda", op) == "tf32";
|
||||
}
|
||||
warn_deprecated_fp32_precision_api();
|
||||
return allow_tf32_cudnn;
|
||||
}
|
||||
|
||||
@ -201,7 +191,6 @@ void Context::setAllowTF32CuDNN(bool b) {
|
||||
setFloat32Precision("cuda", "rnn", b ? "tf32" : "none");
|
||||
setFloat32Precision("cuda", "conv", b ? "tf32" : "none");
|
||||
allow_tf32_cudnn = b;
|
||||
warn_deprecated_fp32_precision_api();
|
||||
}
|
||||
|
||||
void Context::setSDPPriorityOrder(const std::vector<int64_t>& order) {
|
||||
@ -357,7 +346,6 @@ bool Context::allowTF32CuBLAS() const {
|
||||
"Current status indicate that you have used mix of the legacy and new APIs to set the TF32 status for cublas matmul. ",
|
||||
"We suggest only using the new API to set the TF32 flag. See also: ",
|
||||
"https://pytorch.org/docs/main/notes/cuda.html#tensorfloat-32-tf32-on-ampere-and-later-devices");
|
||||
warn_deprecated_fp32_precision_api();
|
||||
return allow_tf32_new;
|
||||
}
|
||||
|
||||
@ -389,7 +377,6 @@ Float32MatmulPrecision Context::float32MatmulPrecision() const {
|
||||
"Current status indicate that you have used mix of the legacy and new APIs to set the matmul precision. ",
|
||||
"We suggest only using the new API for matmul precision. See also: ",
|
||||
"https://pytorch.org/docs/main/notes/cuda.html#tensorfloat-32-tf32-on-ampere-and-later-devices");
|
||||
warn_deprecated_fp32_precision_api();
|
||||
return float32_matmul_precision;
|
||||
}
|
||||
|
||||
@ -406,7 +393,6 @@ std::string Context::float32Precision(const std::string& backend, const std::str
|
||||
|
||||
void Context::setFloat32MatmulPrecision(const std::string &s) {
|
||||
auto match = [this](const std::string & s_) {
|
||||
warn_deprecated_fp32_precision_api();
|
||||
// TODO: consider if CuDNN field needs to also be set for potential future CuDNN ops like multi-headed attention
|
||||
if (s_ == "highest") {
|
||||
float32_matmul_precision = at::Float32MatmulPrecision::HIGHEST;
|
||||
|
||||
@ -155,6 +155,12 @@ class TORCH_API Context {
|
||||
static long versionCuDNN() {
|
||||
return detail::getCUDAHooks().versionCuDNN();
|
||||
}
|
||||
static long versionRuntimeCuDNN() {
|
||||
return detail::getCUDAHooks().versionRuntimeCuDNN();
|
||||
}
|
||||
static long versionCuDNNFrontend() {
|
||||
return detail::getCUDAHooks().versionCuDNNFrontend();
|
||||
}
|
||||
static bool hasCuSOLVER() {
|
||||
return detail::getCUDAHooks().hasCuSOLVER();
|
||||
}
|
||||
|
||||
@ -21,6 +21,7 @@
|
||||
|
||||
#if AT_CUDNN_ENABLED()
|
||||
#include <ATen/cudnn/cudnn-wrapper.h>
|
||||
#include <cudnn_frontend.h>
|
||||
#endif
|
||||
|
||||
#if AT_MAGMA_ENABLED()
|
||||
@ -325,6 +326,26 @@ long CUDAHooks::versionCuDNN() const {
|
||||
#endif
|
||||
}
|
||||
|
||||
long CUDAHooks::versionRuntimeCuDNN() const {
|
||||
#if AT_CUDNN_ENABLED()
|
||||
#ifndef USE_STATIC_CUDNN
|
||||
return cudnnGetVersion();
|
||||
#else
|
||||
return CUDNN_VERSION;
|
||||
#endif
|
||||
#else
|
||||
TORCH_CHECK(false, "Cannot query CuDNN version if ATen_cuda is not built with CuDNN");
|
||||
#endif
|
||||
}
|
||||
|
||||
long CUDAHooks::versionCuDNNFrontend() const {
|
||||
#if AT_CUDNN_ENABLED()
|
||||
return CUDNN_FRONTEND_VERSION;
|
||||
#else
|
||||
TORCH_CHECK(false, "Cannot query CuDNN Frontend version if ATen_cuda is not built with CuDNN");
|
||||
#endif
|
||||
}
|
||||
|
||||
long CUDAHooks::versionMIOpen() const {
|
||||
#if AT_ROCM_ENABLED()
|
||||
return MIOPEN_VERSION_MAJOR * 10000 +
|
||||
|
||||
@ -48,6 +48,8 @@ struct CUDAHooks : public at::CUDAHooksInterface {
|
||||
bool hasCUDART() const override;
|
||||
long versionCUDART() const override;
|
||||
long versionCuDNN() const override;
|
||||
long versionRuntimeCuDNN() const override;
|
||||
long versionCuDNNFrontend() const override;
|
||||
long versionMIOpen() const override;
|
||||
std::string showConfig() const override;
|
||||
double batchnormMinEpsilonCuDNN() const override;
|
||||
|
||||
@ -170,6 +170,14 @@ struct TORCH_API CUDAHooksInterface : AcceleratorHooksInterface {
|
||||
TORCH_CHECK(false, "Cannot query cuDNN version without ATen_cuda library. ", CUDA_HELP);
|
||||
}
|
||||
|
||||
virtual long versionRuntimeCuDNN() const {
|
||||
TORCH_CHECK(false, "Cannot query cuDNN version without ATen_cuda library. ", CUDA_HELP);
|
||||
}
|
||||
|
||||
virtual long versionCuDNNFrontend() const {
|
||||
TORCH_CHECK(false, "Cannot query cuDNN Frontend version without ATen_cuda library. ", CUDA_HELP);
|
||||
}
|
||||
|
||||
virtual long versionMIOpen() const {
|
||||
TORCH_CHECK(false, "Cannot query MIOpen version without ATen_cuda library. ", CUDA_HELP);
|
||||
}
|
||||
|
||||
@ -413,9 +413,9 @@ struct ConvParams {
|
||||
if (!detail::getCUDAHooks().compiledWithCuDNN() || !input.is_cuda() || !cudnn_enabled) {
|
||||
return false;
|
||||
}
|
||||
static long cudnn_version = detail::getCUDAHooks().versionCuDNN();
|
||||
// broken on cuDNN 9.8
|
||||
if (cudnn_version >= 90800) {
|
||||
static long cudnn_version = detail::getCUDAHooks().versionRuntimeCuDNN();
|
||||
// broken on cuDNN 9.8 - 9.14
|
||||
if (cudnn_version >= 90800 && cudnn_version < 91500) {
|
||||
if (cudnn_conv_suggest_memory_format(input, weight) == at::MemoryFormat::Contiguous &&
|
||||
(input.scalar_type() == at::kBFloat16 || input.scalar_type() == at::kHalf) &&
|
||||
weight.dim() == 5) {
|
||||
@ -457,7 +457,7 @@ struct ConvParams {
|
||||
}
|
||||
// native kernel doesn't support 64-bit non-splittable case
|
||||
if (!(canUse32BitIndexMath(input) && canUse32BitIndexMath(weight))) {
|
||||
static long cudnn_version = detail::getCUDAHooks().compiledWithCuDNN() ? detail::getCUDAHooks().versionCuDNN() : -1;
|
||||
static long cudnn_version = detail::getCUDAHooks().compiledWithCuDNN() ? detail::getCUDAHooks().versionRuntimeCuDNN() : -1;
|
||||
// TODO(eqy): remove this once cuDNN fixes 64-bit depthwise support, first broken in 9.11x
|
||||
if (cudnn_conv_suggest_memory_format(input, weight) != at::MemoryFormat::Contiguous) {
|
||||
if (cudnn_version < 0 || cudnn_version > 91000) {
|
||||
|
||||
@ -73,7 +73,6 @@ void gpu_index_kernel(TensorIteratorBase& iter, const IntArrayRef index_size, co
|
||||
|
||||
char* const out_ptr = static_cast<char*>(iter.data_ptr(0));
|
||||
char* const in_ptr = static_cast<char*>(iter.data_ptr(1));
|
||||
|
||||
if (is_gather_like && num_indices==1) {
|
||||
const size_t element_size = iter.element_size(0);
|
||||
constexpr size_t alignment = 16;
|
||||
@ -83,11 +82,10 @@ void gpu_index_kernel(TensorIteratorBase& iter, const IntArrayRef index_size, co
|
||||
auto ind_dim_size = index_size[0];
|
||||
auto inp_stride_bytes = index_stride[0];
|
||||
auto out_stride_bytes = iter.strides(0)[1];
|
||||
if (iter.numel() == 0) return;
|
||||
at::native::vectorized_gather_kernel_launch<alignment, int64_t>(out_ptr, in_ptr, (int64_t*)iter.data_ptr(2), num_ind,
|
||||
slice_size, ind_dim_size, inp_stride_bytes, out_stride_bytes, /*allow_neg_indices*/true);
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
auto sizes = std::array<int64_t, MAX_DIMS>{};
|
||||
|
||||
@ -14,10 +14,11 @@ __global__ void vectorized_gather_kernel(char * out, char * inp, index_t * idx,
|
||||
ind = (ind < 0) ? ind + ind_dim_size : ind;
|
||||
}
|
||||
CUDA_KERNEL_ASSERT(ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds");
|
||||
int32_t off = (blockDim.x * blockIdx.y + threadIdx.x) * Alignment; // off is guaranteed to be within int32 limits
|
||||
if (off >= slice_size) return;
|
||||
auto vec = at::native::memory::ld_vec<Alignment>(inp + ind * inp_stride + off);
|
||||
at::native::memory::st_vec<Alignment>(out + blockIdx.x * (int32_t)out_stride + off, vec); // out offset is guaranteed to be within int32 limits
|
||||
// off is guaranteed to be within int32 limits
|
||||
for (int32_t off = (blockDim.x * blockIdx.y + threadIdx.x) * Alignment; off < slice_size; off += blockDim.x * gridDim.y * Alignment) {
|
||||
auto vec = at::native::memory::ld_vec<Alignment>(inp + ind * inp_stride + off);
|
||||
at::native::memory::st_vec<Alignment>(out + blockIdx.x * (int32_t)out_stride + off, vec); // out offset is guaranteed to be within int32 limits
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -30,7 +31,9 @@ void vectorized_gather_kernel_launch(char * out, char * inp, index_t * idx, int
|
||||
auto num_threads = at::round_up(
|
||||
at::ceil_div(slice_size_in_bytes, Alignment),
|
||||
static_cast<int64_t>(C10_WARP_SIZE));
|
||||
dim3 grid = {static_cast<uint32_t>(num_ind), static_cast<uint32_t>(at::ceil_div(slice_size_in_bytes, max_num_threads * Alignment)), 1};
|
||||
uint32_t grid_y = at::cuda::getCurrentDeviceProperties()->maxGridSize[1];
|
||||
grid_y = std::min(static_cast<uint32_t>(at::ceil_div(slice_size_in_bytes, max_num_threads * Alignment)), grid_y);
|
||||
dim3 grid = {static_cast<uint32_t>(num_ind), grid_y, 1};
|
||||
auto block = std::min(max_num_threads, num_threads);
|
||||
vectorized_gather_kernel<Alignment, index_t><<<grid, block, 0, at::cuda::getCurrentCUDAStream()>>>(out, inp, idx, num_ind, slice_size_in_bytes,
|
||||
ind_dim_size, inp_stride_bytes, out_stride_bytes, allow_neg_indices);
|
||||
|
||||
@ -437,7 +437,7 @@ bool check_cudnn_tensor_shapes(sdp_params const& params, bool debug) {
|
||||
const auto s_k = params.key.sym_size(2);
|
||||
const auto d_qk = params.query.sym_size(3);
|
||||
const auto d_v = params.value.sym_size(3);
|
||||
long cudnn_version = at::detail::getCUDAHooks().versionCuDNN();
|
||||
long cudnn_version = at::detail::getCUDAHooks().versionRuntimeCuDNN();
|
||||
if (cudnn_version < 8903) {
|
||||
if (debug) {
|
||||
TORCH_WARN("SDPA fprop requires cudnn 8.9.3 or higher");
|
||||
@ -668,7 +668,7 @@ bool can_use_cudnn_attention(const sdp_params& params, bool debug) {
|
||||
return false;
|
||||
#endif
|
||||
#if defined(CUDNN_VERSION)
|
||||
static auto cudnn_version = cudnnGetVersion();
|
||||
static auto cudnn_version = at::detail::getCUDAHooks().versionRuntimeCuDNN();
|
||||
if (params.dropout > 0.0 && cudnn_version > 91100 && cudnn_version < 91400) {
|
||||
if (debug) {
|
||||
TORCH_WARN(CUDNN_VERSION, " cuDNN version does not support droppout in SDPA (9.11 - 9.13).");
|
||||
|
||||
@ -1,6 +1,173 @@
|
||||
# LibTorch Stable ABI
|
||||
|
||||
This note will eventually contain more details on how to use the APIs in torch/csrc/stable. For the moment, it contains a table of internal representations:
|
||||
## Overview
|
||||
|
||||
The LibTorch Stable ABI (Application Binary Interface) provides a limited interface for extending PyTorch functionality without being tightly coupled to specific PyTorch versions. This enables the development of custom operators and extensions that remain compatible across PyTorch releases. This limited set of APIs is not intended to replace existing LibTorch, but rather to provide a stable foundation for a majority of custom extension use cases. If there is any API you would like to see added to the stable ABI, please file a request through a [new issue on the PyTorch repo](https://github.com/pytorch/pytorch/issues).
|
||||
|
||||
The limited stable ABI consists of three main components:
|
||||
|
||||
1. **Stable C headers** - Low-level C API implemented by libtorch (primarily `torch/csrc/inductor/aoti_torch/c/shim.h`)
|
||||
2. **Header-only C++ library** - Standalone utilities implemented in only headers such that there is no dependence on libtorch (`torch/headeronly/*`)
|
||||
3. **Stable C++ wrappers** - High-level C++ convenience wrappers (`torch/csrc/stable/*`)
|
||||
|
||||
We discuss each of these in detail below.
|
||||
|
||||
### `torch/headeronly`
|
||||
|
||||
The inlined C++ headers living in [`torch/headeronly`](https://github.com/pytorch/pytorch/tree/main/torch/headeronly) are completely decoupled from LibTorch. The headers consist of certain utilities that might be familiar to custom extension writers. For example, the
|
||||
`c10::ScalarType` enum lives here as `torch::headeronly::ScalarType`, and a libtorch-independent version of `TORCH_CHECK` is available as `STD_TORCH_CHECK`. You can trust all APIs in the `torch::headeronly` namespace not to depend on `libtorch.so`. These APIs are also globally listed in [torch/header_only_apis.txt](https://github.com/pytorch/pytorch/blob/main/torch/header_only_apis.txt).
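
For instance, a dtype check that never touches `libtorch.so` could look like the following minimal sketch (it reuses the two headers from the migration example later in this note; which header ultimately declares `STD_TORCH_CHECK` is an assumption here):

```cpp
// Minimal sketch: uses only header-only utilities, no link against libtorch.so assumed.
#include <torch/headeronly/core/ScalarType.h>
#include <torch/headeronly/macros/Macros.h>

// Validate a dtype using only torch::headeronly utilities.
// STD_TORCH_CHECK is the libtorch-independent analogue of TORCH_CHECK.
inline void check_is_float(torch::headeronly::ScalarType dtype) {
  STD_TORCH_CHECK(dtype == torch::headeronly::kFloat, "expected a float32 dtype");
}
```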
|
||||
|
||||
### `torch/csrc/stable`
|
||||
|
||||
This is a set of inlined C++ headers that provide wrappers around the C API and handle the rough edges
|
||||
discussed below.
|
||||
|
||||
It consists of
|
||||
|
||||
- torch/csrc/stable/library.h: Provides a stable version of TORCH_LIBRARY and similar macros.
|
||||
- torch/csrc/stable/tensor_struct.h: Provides torch::stable::Tensor, a stable version of at::Tensor.
|
||||
- torch/csrc/stable/ops.h: Provides a stable interface for calling ATen ops from `native_functions.yaml`.
|
||||
- torch/csrc/stable/accelerator.h: Provides a stable interface for device-generic objects and APIs
|
||||
(e.g. `getCurrentStream`, `DeviceGuard`).
|
||||
|
||||
We are continuing to improve coverage in our `torch/csrc/stable` APIs. Please file an issue if you'd like to see support for particular APIs in your custom extension.
|
||||
|
||||
### Stable C headers
|
||||
|
||||
The stable C headers started by AOTInductor form the foundation of the stable ABI. Presently, the available C headers include:
|
||||
|
||||
- [torch/csrc/inductor/aoti_torch/c/shim.h](https://github.com/pytorch/pytorch/blob/main/torch/csrc/inductor/aoti_torch/c/shim.h): Includes C-style shim APIs for common operations regarding Tensors, dtypes, CUDA, and the like.
|
||||
- [torch/csrc/inductor/aoti_torch/generated/c_shim_aten.h](https://github.com/pytorch/pytorch/blob/main/torch/csrc/inductor/aoti_torch/generated/c_shim_aten.h): Includes C-style shim APIs for ATen ops from `native_functions.yaml` (e.g. `aoti_torch_aten_new_empty`).
|
||||
- [torch/csrc/inductor/aoti_torch/generated/c_shim_*.h](https://github.com/pytorch/pytorch/blob/main/torch/csrc/inductor/aoti_torch/generated): Includes C-style shim APIs for specific backend kernels dispatched from `native_functions.yaml` (e.g. `aoti_torch_cuda_pad`). These APIs should only be used for the specific backend they are named after (e.g. `aoti_torch_cuda_pad` should only be used within CUDA kernels), as they opt out of the dispatcher.
|
||||
- [torch/csrc/stable/c/shim.h](https://github.com/pytorch/pytorch/blob/main/torch/csrc/stable/c/shim.h): We are building out more ABIs to logically live in `torch/csrc/stable/c` instead of continuing the AOTI naming that no longer makes sense for our general use case.
|
||||
|
||||
These headers are promised to be ABI stable across releases and adhere to a stronger backwards compatibility policy than LibTorch. Specifically, we promise not to modify them for at least 2 years after they are released. However, this is **use at your own risk**. For example, users must handle the memory lifecycle of objects returned by certain APIs. Further, the stack-based APIs discussed below which allow the user to call into the PyTorch dispatcher do not provide strong guarantees on forward and backward compatibility of the underlying op that is called.
|
||||
|
||||
Unless absolutely necessary, we recommend the high-level C++ API in `torch/csrc/stable`
|
||||
which will handle all the rough edges of the C API for the user.
|
||||
|
||||
## Migrating your kernel to the LibTorch stable ABI
|
||||
|
||||
If you'd like your kernel to be ABI stable with LibTorch, meaning you'd like the ability to build for one version and run on another, your kernel must only use the limited stable ABI. The following section goes through the steps of migrating an existing kernel and the APIs we imagine you would need to swap over.
|
||||
|
||||
Firstly, instead of registering kernels through `TORCH_LIBRARY`, LibTorch ABI stable kernels must be registered via `STABLE_TORCH_LIBRARY`. Note that, for the time being, implementations registered via `STABLE_TORCH_LIBRARY` must be boxed, unlike `TORCH_LIBRARY`. See the simple example below or our docs on [Stack-based APIs](stack-based-apis) for more details. For kernels that are currently registered via `pybind`, it would be useful to first migrate them to `TORCH_LIBRARY` before moving to the stable ABI.
|
||||
|
||||
While previously your kernels might have included APIs from `<torch/*.h>` (for example, `<torch/all.h>`), they are now limited to including from the 3 categories of headers mentioned above (`torch/csrc/stable/*.h`, `torch/headeronly/*.h` and the stable C headers). This means that your extension should no longer use any utilities from the `at::` or `c10::` namespaces but instead use their replacements in `torch::stable` and `torch::headeronly`. To provide a couple examples of the necessary migrations:
|
||||
- all uses of `at::Tensor` must be replaced with `torch::stable::Tensor`
|
||||
- all uses of `TORCH_CHECK` must be replaced with `STD_TORCH_CHECK`
|
||||
- all uses of `at::kCUDA` must be replaced with `torch::headeronly::kCUDA` etc.
|
||||
- native functions such as `at::pad` must be replaced with `torch::stable::pad`
|
||||
- native functions that are called as Tensor methods (e.g., `Tensor.pad`) must be replaced with the ATen variant through `torch::stable::pad`.
|
||||
|
||||
As mentioned above, the LibTorch stable ABI is still under development. If there is any API or feature you would like to see added to the stable ABI/`torch::headeronly`/`torch::stable`, please file a request through a [new issue on the PyTorch repo](https://github.com/pytorch/pytorch/issues).
|
||||
|
||||
Below is a simple example of migrating an existing kernel that uses `TORCH_LIBRARY` to the stable ABI (`STABLE_TORCH_LIBRARY`). For a larger end-to-end example, you can take a look at the FA3 repository, specifically the diff between [`flash_api.cpp`](https://github.com/Dao-AILab/flash-attention/blob/ad70a007e6287d4f7e766f94bcf2f9a813f20f6b/hopper/flash_api.cpp#L1) and the stable variant [`flash_api_stable.cpp`](https://github.com/Dao-AILab/flash-attention/blob/ad70a007e6287d4f7e766f94bcf2f9a813f20f6b/hopper/flash_api_stable.cpp#L1).
|
||||
|
||||
|
||||
### Original Version with `TORCH_LIBRARY`
|
||||
|
||||
```cpp
|
||||
// original_kernel.cpp - Using TORCH_LIBRARY (not stable ABI)
|
||||
#include <torch/torch.h>
|
||||
#include <ATen/ATen.h>
|
||||
|
||||
namespace myops {
|
||||
|
||||
// Simple kernel that adds a scalar value to each element of a tensor
|
||||
at::Tensor add_scalar(const at::Tensor& input, double scalar) {
|
||||
TORCH_CHECK(input.scalar_type() == at::kFloat, "Input must be float32");
|
||||
|
||||
return input.add(scalar);
|
||||
}
|
||||
|
||||
// Register the operator
|
||||
TORCH_LIBRARY(myops, m) {
|
||||
m.def("add_scalar(Tensor input, float scalar) -> Tensor", &add_scalar);
|
||||
}
|
||||
|
||||
// Register the implementation
|
||||
TORCH_LIBRARY_IMPL(myops, CompositeExplicitAutograd, m) {
|
||||
m.impl("add_scalar", &add_scalar);
|
||||
}
|
||||
|
||||
} // namespace myops
|
||||
```
|
||||
|
||||
### Migrated Version with `STABLE_TORCH_LIBRARY`
|
||||
|
||||
```cpp
|
||||
// stable_kernel.cpp - Using STABLE_TORCH_LIBRARY (stable ABI)
|
||||
|
||||
// (1) Don't include <torch/torch.h> <ATen/ATen.h>
|
||||
// only include APIs from torch/csrc/stable, torch/headeronly and C-shims
|
||||
#include <torch/csrc/stable/library.h>
|
||||
#include <torch/csrc/stable/tensor_struct.h>
|
||||
#include <torch/csrc/stable/ops.h>
|
||||
#include <torch/csrc/stable/stableivalue_conversions.h>
|
||||
#include <torch/headeronly/core/ScalarType.h>
|
||||
#include <torch/headeronly/macros/Macros.h>
|
||||
|
||||
namespace myops {
|
||||
|
||||
// Simple kernel that adds a scalar value to each element of a tensor
|
||||
torch::stable::Tensor add_scalar(const torch::stable::Tensor& input, double scalar) {
|
||||
// (2) use STD_TORCH_CHECK instead of TORCH_CHECK
|
||||
STD_TORCH_CHECK(
|
||||
// (3) use torch::headeronly::kFloat instead of at::kFloat
|
||||
input.scalar_type() == torch::headeronly::kFloat,
|
||||
"Input must be float32");
|
||||
|
||||
// (4) Use stable ops namespace instead of input.add
|
||||
return torch::stable::add(input, scalar);
|
||||
}
|
||||
|
||||
// (5) Add Boxed wrapper required for STABLE_TORCH_LIBRARY
|
||||
void boxed_add_scalar(StableIValue* stack, uint64_t num_args, uint64_t num_outputs) {
|
||||
// Extract arguments from stack using `to<T>`
|
||||
auto input = to<torch::stable::Tensor>(stack[0]);
|
||||
auto scalar = to<double>(stack[1]);
|
||||
|
||||
// Call the actual kernel
|
||||
auto result = add_scalar(input, scalar);
|
||||
|
||||
// Put result back on stack using `from()`
|
||||
// Stack slot 0 now holds the return value
|
||||
stack[0] = from(result);
|
||||
}
|
||||
|
||||
// (6) Register the operator using STABLE_TORCH_LIBRARY
|
||||
STABLE_TORCH_LIBRARY(myops, m) {
|
||||
m.def("add_scalar(Tensor input, float scalar) -> Tensor", &boxed_add_scalar);
|
||||
}
|
||||
|
||||
// (7) Register the implementation using STABLE_TORCH_LIBRARY_IMPL
|
||||
STABLE_TORCH_LIBRARY_IMPL(myops, CompositeExplicitAutograd, m) {
|
||||
m.impl("add_scalar", &boxed_add_scalar);
|
||||
}
|
||||
|
||||
} // namespace myops
|
||||
```
|
||||
|
||||
|
||||
## How are objects passed across the ABI boundary when interacting with the dispatcher?
|
||||
|
||||
When interacting with the dispatcher via the stable APIs (``STABLE_TORCH_LIBRARY`` etc.) we use a boxed convention. Arguments and returns are represented as a stack of ``StableIValue``, which correlates with a `torch::jit::stack` of IValues. We discuss the following below:
|
||||
1. StableIValue Conversions
|
||||
2. StableIValue stack Conventions
|
||||
3. Stable APIs that interact with the dispatcher
|
||||
|
||||
### StableIValue Conversions
|
||||
|
||||
We provide utilities for users to convert objects to and from StableIValues with the synonymous
|
||||
`to` and `from` APIs in `torch/csrc/stable/stableivalue_conversions.h`. We document the stable custom extension representation, libtorch representation and StableIValue
|
||||
representations below. Our confidently supported types are the ones in the table that have completed
|
||||
rows. You can rely on this subset for proper ABI stability, meaning that you can call `to<T_custom_ext>(arg/ret)` or `from(T)` on these types.
|
||||
|
||||
For a limited set of use cases, we also implicitly support any literal type that is representable within 64 bits as StableIValues, as the default reinterpret_cast will succeed. (For example: c10::Device.) These types are currently ABI-stable on best effort but might break in the future and thus should be used for short term testing only.
|
||||
|
||||
You can always work with StableIValue abstractions in your custom kernel for types such as c10::Device even if there is no standard defined representation of device in custom extensions by not introspecting into the StableIValue. For example, a custom operator can take as argument a StableIValue device and directly pass it through to an aten operator with `aoti_torch_call_dispatcher`.
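
A rough sketch of that pass-through pattern is below. The op name `myops::to_device` and its `(Tensor, Device) -> Tensor` schema are hypothetical, and the `STABLE_TORCH_LIBRARY` registration is omitted; the point is only that the Device slot is never converted:

```cpp
// Hypothetical boxed kernel taking (Tensor, Device) and forwarding both
// StableIValues to another dispatcher op without ever converting the Device.
#include <torch/csrc/inductor/aoti_torch/c/shim.h>
#include <torch/csrc/stable/stableivalue_conversions.h>

void boxed_forward_to_device(StableIValue* stack, uint64_t num_args, uint64_t num_outputs) {
  // Steal both argument references from our stack and give them to the callee's
  // stack; the Device in call_stack[1] stays an opaque handle throughout.
  StableIValue call_stack[2] = {stack[0], stack[1]};
  aoti_torch_call_dispatcher("myops::to_device", "", call_stack);
  // The callee leaves its owning return value at index 0; push it as our output.
  stack[0] = call_stack[0];
}
```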
|
||||
|
||||
|
||||
1. type in custom extension: type used within the end user custom library.
|
||||
2. StableIValue representation: a stable conversion of the type to liaison between the user model vs libtorch.so in an ABI-stable manner.
|
||||
3. type in libtorch: type used within libtorch.so (or any code binary locked with libtorch).
|
||||
@ -31,16 +198,10 @@ This note will eventually contain more details on how to use the APIs in torch/c
|
||||
| ? | ? | c10::SymBool | SymBool |
|
||||
| ? | ? | at::QScheme | QScheme |
|
||||
|
||||
Our confidently supported types are the ones in the table that have completed rows. You can rely on this subset for proper ABI stability.
|
||||
|
||||
For a limited set of use cases, we also implicitly support any literal type that is representable within 64 bits as StableIValues, as the default reinterpret_cast will succeed. (For example: c10::Device.) These types are currently ABI-stable on best effort but might break in the future and thus should be used for short term testing only.
|
||||
### Stack Conventions
|
||||
|
||||
You can always work with StableIValue abstractions in your custom kernel for types such as c10::Device even if there is no standard defined representation of device in custom extensions by not introspecting into the StableIValue. For example, a custom operator can take as argument a StableIValue device and directly pass it through to an aten operator with `aoti_torch_call_dispatcher`.
|
||||
|
||||
|
||||
## How to use stack-based APIs
|
||||
|
||||
`aoti_torch_call_dispatcher` is what we consider a stack-based API because it takes as input a stack of StableIValues, which correlates with a `torch::jit::stack` of IValues. Working with the dispatcher will likely bring you into proximity with stack-based APIs, so we are documenting some invariants:
|
||||
There are two invariants for the stack:
|
||||
|
||||
1. The stack is populated left to right.
|
||||
a. For example, a stack representing arguments `arg0`, `arg1`, and `arg2` will have `arg0` at index 0, `arg1` at index 1, and `arg2` at index 2.
|
||||
@ -49,3 +210,33 @@ You can always work with StableIValue abstractions in your custom kernel for typ
|
||||
2. The stack always has ownership of the objects it holds.
|
||||
a. When calling a stack-based API, you must give owning references to the calling stack and steal references from the returned stack.
|
||||
b. When registering your function to be called with a stack, you must steal references from your argument stack and push onto the stack new references.
|
||||
|
||||
(stack-based-apis)=
|
||||
### Stack-based APIs
|
||||
|
||||
The above is relevant in two places:
|
||||
|
||||
1. `STABLE_TORCH_LIBRARY`
|
||||
Unlike `TORCH_LIBRARY`, the dispatcher expects kernels registered via `STABLE_TORCH_LIBRARY` to be boxed. This means they must have the signature `(StableIValue* stack, uint64_t num_args, uint64_t num_outputs) -> void`. We plan to eventually abstract away the need for manual boxing, but for the time being, please use `from` and `to`.
|
||||
|
||||
```cpp
|
||||
Tensor my_amax_vec(Tensor t) {
|
||||
std::vector<int64_t> v = {0,1};
|
||||
return amax(t, v, false);
|
||||
}
|
||||
|
||||
void boxed_my_amax_vec(StableIValue* stack, uint64_t num_args, uint64_t num_outputs) {
|
||||
auto res = my_amax_vec(to<Tensor>(stack[0]));
|
||||
stack[0] = from(res);
|
||||
}
|
||||
```
|
||||
|
||||
2. `aoti_torch_call_dispatcher`
|
||||
This API allows you to call the PyTorch dispatcher from C/C++ code. It has the following signature:
|
||||
```cpp
|
||||
aoti_torch_call_dispatcher(const char* opName, const char* overloadName, StableIValue* stack);
|
||||
```
|
||||
|
||||
`aoti_torch_call_dispatcher` will call the op overload defined by a given `opName`, `overloadName`, and a stack of
|
||||
StableIValues. This call will populate any return values of the op into the stack in their StableIValue form,
|
||||
with `ret0` at index 0, `ret1` at index 1, and so on.
|
||||
|
||||
@ -3761,27 +3761,6 @@ class NcclProcessGroupWithDispatchedCollectivesTests(
|
||||
dist.all_gather_into_tensor(output_tensor, tensor)
|
||||
self.assertEqual(output_tensor, tensor)
|
||||
|
||||
@requires_nccl()
|
||||
@skip_if_lt_x_gpu(2)
|
||||
def test_allgather_noncontig(self):
|
||||
store = dist.FileStore(self.file_name, self.world_size)
|
||||
dist.init_process_group(
|
||||
"nccl",
|
||||
world_size=self.world_size,
|
||||
rank=self.rank,
|
||||
store=store,
|
||||
)
|
||||
device = "cuda"
|
||||
tensor = (
|
||||
torch.arange(0, 16, device=torch.device(device))
|
||||
.view(2, 2, 2, 2)
|
||||
.to(memory_format=torch.channels_last)
|
||||
)
|
||||
tensor_list = [torch.empty_like(tensor) for _ in range(self.world_size)]
|
||||
dist.all_gather(tensor_list, tensor)
|
||||
for o in tensor_list:
|
||||
self.assertEqual(o, tensor)
|
||||
|
||||
@requires_nccl()
|
||||
@skip_if_lt_x_gpu(1)
|
||||
@parametrize("float8_dtype", [torch.float8_e4m3fn, torch.float8_e5m2])
|
||||
|
||||
@ -1835,6 +1835,59 @@ class GraphModule(torch.nn.Module):
|
||||
self.assertEqual(ref, res)
|
||||
self.assertEqual(len(counters["graph_break"]), 1)
|
||||
|
||||
def test_311_resume_block_keyerror(self):
|
||||
# https://github.com/pytorch/pytorch/issues/162313
|
||||
flag = True
|
||||
|
||||
def fn(x):
|
||||
x = x + 1
|
||||
torch._dynamo.graph_break()
|
||||
x = x + 2
|
||||
if flag:
|
||||
with torch.no_grad():
|
||||
torch._dynamo.graph_break()
|
||||
x = x + 4
|
||||
else:
|
||||
with torch.no_grad():
|
||||
torch._dynamo.graph_break()
|
||||
x = x + 8
|
||||
return x + 16
|
||||
|
||||
inp = torch.ones(3)
|
||||
opt_fn = torch.compile(fn, backend="eager")
|
||||
self.assertEqual(fn(inp), opt_fn(inp))
|
||||
flag = False
|
||||
self.assertEqual(fn(inp), opt_fn(inp))
|
||||
|
||||
def test_311_resume_block_keyerror2(self):
|
||||
# https://github.com/pytorch/pytorch/issues/166176
|
||||
def fn(x):
|
||||
torch._dynamo.graph_break()
|
||||
with torch.no_grad():
|
||||
with torch.no_grad():
|
||||
torch._dynamo.graph_break()
|
||||
return x + 1
|
||||
|
||||
inp = torch.ones(3)
|
||||
opt_fn = torch.compile(fn, backend="eager")
|
||||
self.assertEqual(fn(inp), opt_fn(inp))
|
||||
|
||||
def test_store_attr_graph_break_key_error(self):
|
||||
# STORE_ATTR on dummy should result in graph break
|
||||
def dummy():
|
||||
pass
|
||||
|
||||
def fn(x):
|
||||
x = x + 2
|
||||
with torch.no_grad():
|
||||
dummy.attr1 = x
|
||||
return x + 4
|
||||
|
||||
inp = torch.ones(3)
|
||||
opt_fn = torch.compile(fn, backend="eager")
|
||||
self.assertEqual(fn(inp), opt_fn(inp))
|
||||
self.assertGreater(len(counters["graph_break"]), 0)
|
||||
|
||||
|
||||
class ContextlibContextManagerTests(torch._dynamo.test_case.TestCase):
|
||||
def setUp(self):
|
||||
|
||||
@ -2016,6 +2016,23 @@ class DecoratorTests(torch._dynamo.test_case.TestCase):
|
||||
|
||||
self.assertEqual(f(), 1)
|
||||
|
||||
def test_error_on_graph_break_nonempty_checkpoint(self):
|
||||
cnts = torch._dynamo.testing.CompileCounter()
|
||||
|
||||
@torch.compile(backend=cnts)
|
||||
def fn(x):
|
||||
x = x + 1
|
||||
x = x + 1
|
||||
x = x + 1
|
||||
with torch._dynamo.error_on_graph_break(True):
|
||||
torch._dynamo.graph_break()
|
||||
return x + 1
|
||||
|
||||
with self.assertRaises(Unsupported):
|
||||
fn(torch.ones(3))
|
||||
|
||||
self.assertEqual(cnts.frame_count, 0)
|
||||
|
||||
def test_nested_compile_fullgraph(self):
|
||||
# Test that fullgraph=True cannot be toggled back by fullgraph=False
|
||||
inp = torch.ones(3)
|
||||
|
||||
@ -7150,48 +7150,6 @@ def forward(self, s77 : torch.SymInt, s27 : torch.SymInt, L_x_ : torch.Tensor):
|
||||
0, sys.monitoring.events.PY_START, old_callback
|
||||
)
|
||||
|
||||
def test_312_local_cell_overlap(self):
|
||||
keys = range(10)
|
||||
allowed = [0, 1, 2, 3]
|
||||
|
||||
def fn(x):
|
||||
x = x + 1
|
||||
torch._dynamo.graph_break()
|
||||
key = [key for key in keys if key in allowed]
|
||||
|
||||
def inner():
|
||||
nonlocal key
|
||||
|
||||
return x + key[0]
|
||||
|
||||
self.assertEqual(
|
||||
fn(torch.ones(3)), torch.compile(fn, backend="eager")(torch.ones(3))
|
||||
)
|
||||
|
||||
def test_311_resume_block_keyerror(self):
|
||||
# https://github.com/pytorch/pytorch/issues/162313
|
||||
flag = True
|
||||
|
||||
def fn(x):
|
||||
x = x + 1
|
||||
torch._dynamo.graph_break()
|
||||
x = x + 2
|
||||
if flag:
|
||||
with torch.no_grad():
|
||||
torch._dynamo.graph_break()
|
||||
x = x + 4
|
||||
else:
|
||||
with torch.no_grad():
|
||||
torch._dynamo.graph_break()
|
||||
x = x + 8
|
||||
return x + 16
|
||||
|
||||
inp = torch.ones(3)
|
||||
opt_fn = torch.compile(fn, backend="eager")
|
||||
self.assertEqual(fn(inp), opt_fn(inp))
|
||||
flag = False
|
||||
self.assertEqual(fn(inp), opt_fn(inp))
|
||||
|
||||
def test_unbind_copy_out(self):
|
||||
def f(eye, out):
|
||||
torch.unbind_copy(eye, out=out)
|
||||
|
||||
@ -899,7 +899,7 @@ class CompiledOptimizerTests(TestCase):
|
||||
compiled = torch.compile(_get_value)
|
||||
|
||||
x = torch.ones(2, 2)
|
||||
mark_static_address(x)
|
||||
mark_static_address(x, guard=True)
|
||||
|
||||
ret_val = compiled(x)
|
||||
|
||||
|
||||
@ -945,35 +945,165 @@ if HAS_CUDA_AND_TRITON:
|
||||
self.assertEqual(num_partitions, 1)
|
||||
|
||||
@torch.library.custom_op("mylib::baz", mutates_args=())
|
||||
def baz(x: torch.Tensor, flag: int) -> torch.Tensor:
|
||||
def baz(x: torch.Tensor) -> torch.Tensor:
|
||||
return x.clone()
|
||||
|
||||
@baz.register_fake
|
||||
def _(x, flag):
|
||||
def _(x):
|
||||
return x.clone()
|
||||
|
||||
def should_partition(x, flag):
|
||||
return flag
|
||||
# custom_should_partition_ops takes effect which lead to 2 partitions
|
||||
torch._inductor.config.custom_should_partition_ops = ["mylib::baz"]
|
||||
|
||||
torch._inductor.scheduler.register_should_partition_rule(
|
||||
torch.ops.mylib.baz.default, should_partition
|
||||
)
|
||||
|
||||
def f(x, flag):
|
||||
def f(x):
|
||||
x = x + 1
|
||||
x = baz(x, flag)
|
||||
x = baz(x)
|
||||
x = x + 1
|
||||
return x
|
||||
|
||||
f_compiled = torch.compile(f, mode="reduce-overhead", fullgraph=True)
|
||||
_, code = run_and_get_code(f_compiled, x, True)
|
||||
_, code = run_and_get_code(f_compiled, x)
|
||||
num_partitions = get_num_partitions(code)
|
||||
self.assertEqual(num_partitions, 2)
|
||||
|
||||
_, code = run_and_get_code(f_compiled, x, False)
|
||||
# update the config should NOT force recompile
|
||||
torch._inductor.config.custom_should_partition_ops = []
|
||||
with torch.compiler.set_stance("fail_on_recompile"):
|
||||
f_compiled(x)
|
||||
|
||||
# run_and_get_code forces recompile. Now we should cache miss, recompile, and
|
||||
# only have 1 partition.
|
||||
_, code = run_and_get_code(f_compiled, x)
|
||||
num_partitions = get_num_partitions(code)
|
||||
self.assertEqual(num_partitions, 1)
|
||||
|
||||
# test that op_overload name takes effect which lead to 2 partitions
|
||||
torch._inductor.config.custom_should_partition_ops = ["mylib::baz.default"]
|
||||
|
||||
f_compiled = torch.compile(f, mode="reduce-overhead", fullgraph=True)
|
||||
_, code = run_and_get_code(f_compiled, x)
|
||||
num_partitions = get_num_partitions(code)
|
||||
self.assertEqual(num_partitions, 2)
|
||||
|
||||
@torch._inductor.config.patch("graph_partition", True)
|
||||
@torch._inductor.config.patch("implicit_fallbacks", True)
|
||||
def test_graph_partition_with_memory_plan_reuse(self):
|
||||
BATCH_SIZE = 16
|
||||
MLP_SIZE = 128
|
||||
HIDDEN_SIZE = 128
|
||||
RANDOM_SEED = 0
|
||||
|
||||
@torch.library.custom_op(
|
||||
"silly::attention",
|
||||
mutates_args=["out"],
|
||||
tags=(torch._C.Tag.cudagraph_unsafe,),
|
||||
)
|
||||
def attention(
|
||||
q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, out: torch.Tensor
|
||||
) -> None:
|
||||
out.copy_(q + k + v)
|
||||
|
||||
@attention.register_fake
|
||||
def _(q, k, v, out):
|
||||
return None
|
||||
|
||||
class ParentModel(torch.nn.Module):
|
||||
def __init__(self) -> None:
|
||||
super().__init__()
|
||||
|
||||
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
||||
return x
|
||||
|
||||
class Attention(torch.nn.Module):
|
||||
def __init__(self, mlp_size: int, hidden_size: int) -> None:
|
||||
super().__init__()
|
||||
self.pre_attn = torch.nn.Linear(mlp_size, hidden_size, bias=False)
|
||||
self.post_attn = torch.nn.Linear(hidden_size, mlp_size, bias=False)
|
||||
self.rms_norm_weight = torch.nn.Parameter(torch.ones(hidden_size))
|
||||
|
||||
def rms_norm_ref(self, x: torch.Tensor) -> torch.Tensor:
|
||||
x_f32 = x.float()
|
||||
return (
|
||||
x_f32
|
||||
* torch.rsqrt(
|
||||
torch.mean(x_f32.square(), dim=-1, keepdim=True) + 1e-6
|
||||
)
|
||||
* self.rms_norm_weight
|
||||
).to(x.dtype)
|
||||
|
||||
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
||||
x = self.pre_attn(x)
|
||||
x = self.rms_norm_ref(x)
|
||||
attn_output = torch.empty_like(x)
|
||||
torch.ops.silly.attention(x, x, x, attn_output)
|
||||
x = attn_output
|
||||
x = self.rms_norm_ref(x)
|
||||
x = self.post_attn(x)
|
||||
return x
|
||||
|
||||
class CompiledAttention(torch.nn.Module):
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
mlp_size: int,
|
||||
hidden_size: int,
|
||||
) -> None:
|
||||
super().__init__()
|
||||
self.attn = Attention(mlp_size, hidden_size)
|
||||
|
||||
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
||||
return self.attn(x)
|
||||
|
||||
class CompiledAttentionTwo(CompiledAttention):
|
||||
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
||||
return self.attn(x) + x
|
||||
|
||||
class SimpleModelWithTwoGraphs(ParentModel):
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
mlp_size: int,
|
||||
hidden_size: int,
|
||||
) -> None:
|
||||
super().__init__()
|
||||
self.attn_one = CompiledAttention(
|
||||
mlp_size=mlp_size,
|
||||
hidden_size=hidden_size,
|
||||
)
|
||||
self.attn_two = CompiledAttentionTwo(
|
||||
mlp_size=mlp_size,
|
||||
hidden_size=hidden_size,
|
||||
)
|
||||
|
||||
self.hidden_states = torch.zeros((BATCH_SIZE, MLP_SIZE)).cuda()
|
||||
|
||||
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
||||
bsz = x.shape[0]
|
||||
# CUDAGraph expects same tensor addresses for each run
|
||||
self.hidden_states[:bsz].copy_(x)
|
||||
x = self.attn_one(self.hidden_states[:bsz])
|
||||
self.hidden_states[:bsz].copy_(x)
|
||||
x = self.attn_two(self.hidden_states[:bsz])
|
||||
return x
|
||||
|
||||
eager_model = (
|
||||
SimpleModelWithTwoGraphs(
|
||||
mlp_size=MLP_SIZE,
|
||||
hidden_size=HIDDEN_SIZE,
|
||||
)
|
||||
.eval()
|
||||
.cuda()
|
||||
)
|
||||
|
||||
compiled_model = torch.compile(eager_model, mode="reduce-overhead")
|
||||
|
||||
inputs = torch.randn(BATCH_SIZE, MLP_SIZE).cuda()
|
||||
|
||||
for _ in range(3):
|
||||
eager_out = eager_model(inputs)
|
||||
compiled_out = compiled_model(inputs)
|
||||
self.assertEqual(eager_out, compiled_out)
|
||||
|
||||
@torch._inductor.config.patch("graph_partition", True)
|
||||
@torch._inductor.config.patch("triton.cudagraph_trees", False)
|
||||
def test_graph_partition_gc(self):
|
||||
@ -2794,6 +2924,22 @@ if HAS_CUDA_AND_TRITON:
|
||||
# 2 graph partitions lead to 2 cudagraph
|
||||
self.assertEqual(self.get_manager().new_graph_id().id, 2)
|
||||
|
||||
def test_graph_partition_view_fallback(self):
|
||||
def f(x):
|
||||
y = x + 1
|
||||
z = torch.ops.aten.view.dtype(y, torch.float8_e4m3fn)
|
||||
z_cpu = z.cpu()
|
||||
u_cuda = z_cpu.cuda()
|
||||
return u_cuda
|
||||
|
||||
compiled_f = torch.compile(f, mode="reduce-overhead")
|
||||
|
||||
for _ in range(3):
|
||||
x = torch.ones(2, dtype=torch.int32, device="cuda")
|
||||
eager_out = f(x)
|
||||
compiled_out = compiled_f(x)
|
||||
self.assertEqual(eager_out, compiled_out)
|
||||
|
||||
@torch._inductor.config.patch("graph_partition", True)
|
||||
def test_graph_partition_log_message(self):
|
||||
def foo(x, y):
|
||||
|
||||
@ -592,6 +592,31 @@ class LoopOrderingTest(TestCase):
|
||||
".run(", 1 + int(inductor_config.benchmark_kernel), exactly=True
|
||||
).run(code[0])
|
||||
|
||||
@inductor_config.patch(
|
||||
{
|
||||
"max_autotune": True,
|
||||
"max_autotune_gemm_backends": "TRITON",
|
||||
"test_configs.max_mm_configs": 4,
|
||||
}
|
||||
)
|
||||
@skipUnless(HAS_GPU and is_big_gpu(), "Need big gpu for max-autotune")
|
||||
def test_interaction_with_multi_template(self):
|
||||
"""
|
||||
Skip MultiTemplateBuffer during loop reordering
|
||||
"""
|
||||
|
||||
@torch.compile
|
||||
def f(x, y):
|
||||
return (x @ y), x + 1
|
||||
|
||||
N = 2
|
||||
x = torch.randn([N, N], device=GPU_TYPE, dtype=torch.bfloat16)
|
||||
y = torch.randn([N, N], device=GPU_TYPE, dtype=torch.bfloat16)
|
||||
|
||||
out, code = run_and_get_code(f, x, y)
|
||||
# didn't fuse due to small savings
|
||||
FileCheck().check_count("@triton.jit", 2, exactly=True).run(code[0])
|
||||
|
||||
def test_fuse_with_scalar_shared_memory(self):
|
||||
"""
|
||||
Make sure if we can fuse two nodes sharing a scalar before,
|
||||
|
||||
@ -1479,6 +1479,29 @@ class TestMaxAutotune(TestCase):
|
||||
# Check that contiguous transform was used
|
||||
FileCheck().check("contiguous_mm").run(code[0])
|
||||
|
||||
@unittest.skipIf(config.cpp_wrapper, "out_dtype override not supported for AOTI")
|
||||
@unittest.skipIf(TEST_WITH_ROCM, "out_dtype override only available on NVIDIA")
|
||||
def test_bmm_out_dtype(self):
|
||||
def f(a, b):
|
||||
return torch.bmm(a, b, out_dtype=torch.float32)
|
||||
|
||||
a = torch.randn(2, 3, 4, device=GPU_TYPE, dtype=torch.float16)
|
||||
b = torch.randn(2, 4, 5, device=GPU_TYPE, dtype=torch.float16)
|
||||
with config.patch(
|
||||
max_autotune=True,
|
||||
max_autotune_gemm_backends="TRITON",
|
||||
):
|
||||
compiled_f = torch.compile(f)
|
||||
with self.assertRaisesRegex(
|
||||
torch._inductor.exc.InductorError,
|
||||
r"LoweringException: NoValidChoicesError: No choices to select",
|
||||
):
|
||||
out, code = run_and_get_code(compiled_f, a, b)
|
||||
|
||||
compiled_f = torch.compile(f)
|
||||
out, code = run_and_get_code(compiled_f, a, b)
|
||||
FileCheck().check("extern_kernels.bmm_dtype").run(code[0])
|
||||
|
||||
def test_triton_template_generated_code_cache_key(self):
|
||||
generate_and_load_args = len(
|
||||
inspect.signature(
|
||||
|
||||
@ -6,7 +6,7 @@ import torch
|
||||
|
||||
from torch.testing import make_tensor
|
||||
from torch.testing._internal.common_utils import \
|
||||
(parametrize, run_tests, TestCase, DeterministicGuard, TEST_WITH_ROCM)
|
||||
(parametrize, run_tests, TestCase, DeterministicGuard, TEST_WITH_ROCM, serialTest)
|
||||
from torch.testing._internal.common_device_type import \
|
||||
(instantiate_device_type_tests, onlyCPU, dtypes, dtypesIfCUDA,
|
||||
toleranceOverride, tol,)
|
||||
@ -65,10 +65,12 @@ class TestScatterGather(TestCase):
|
||||
actual = torch.gather(src, 2, idx)
|
||||
self.assertEqual(actual, expected, atol=0, rtol=0)
|
||||
|
||||
@serialTest()
|
||||
@dtypes(torch.int8, torch.bfloat16)
|
||||
def test_gather_large(self, device, dtype):
|
||||
# test larger shapes to check vectorized implementation
|
||||
for (m, n, k) in ((4096, 3072, 4096), (4096, 3072, 4100)):
|
||||
for (m, n, k) in ((4096, 3072, 4096), (4096, 3072, 4100), (4, 4, 16384 * 8192)):
|
||||
torch.cuda.empty_cache()
|
||||
src = make_tensor((m, k), device=device, dtype=dtype)
|
||||
alloc0 = torch.empty(src.nelement() * 2, device=device, dtype=dtype)
|
||||
discontig = alloc0.view(m, 2 * k)[:, ::2].copy_(src)
|
||||
@ -111,6 +113,8 @@ class TestScatterGather(TestCase):
|
||||
self.assertEqual(res_ind, ref, atol=0, rtol=0)
|
||||
res_gather = torch.gather(misaligned1, dim=dim, index=ind)
|
||||
self.assertEqual(res_gather, ref, atol=0, rtol=0)
|
||||
del src, alloc0, alloc1, alloc2
|
||||
del discontig, misaligned, misaligned1
|
||||
# test gather along 1st dim that can accidentally trigger fast path
|
||||
# because due to index dimension in the gather dim being 1
|
||||
# an unexpected squashing in tensorIterator happens
|
||||
|
||||
@ -2855,6 +2855,30 @@ class TestSDPACudaOnly(NNTestCase):
|
||||
out = torch.nn.functional.scaled_dot_product_attention(q, q, q, dropout_p=0.5)
|
||||
out.backward(grad)
|
||||
|
||||
@skipIfRocm
|
||||
@unittest.skipIf(not PLATFORM_SUPPORTS_CUDNN_ATTENTION, "cudnn Attention is not supported on this system")
|
||||
def test_cudnn_attention_broken_166211(self):
|
||||
# https://github.com/pytorch/pytorch/issues/166211#issue-3551350377
|
||||
shape = (20, 4, 4, 32)
|
||||
scale = 10
|
||||
for i in range(100):
|
||||
q = torch.randn(*shape, device='cuda', dtype=torch.bfloat16) * scale
|
||||
k = torch.randn(*shape, device='cuda', dtype=torch.bfloat16) * scale
|
||||
v = torch.randn(*shape, device='cuda', dtype=torch.bfloat16) * scale
|
||||
q.requires_grad = True
|
||||
k.requires_grad = True
|
||||
v.requires_grad = True
|
||||
|
||||
grad_attn_output = torch.randn(*shape, device='cuda', dtype=torch.bfloat16) * scale
|
||||
|
||||
with torch.nn.attention.sdpa_kernel(torch.nn.attention.SDPBackend.CUDNN_ATTENTION):
|
||||
attn_output = torch.nn.functional.scaled_dot_product_attention(q, k, v)
|
||||
dq, dk, dv = torch.autograd.grad(outputs=attn_output, inputs=(q, k, v), grad_outputs=grad_attn_output)
|
||||
|
||||
self.assertFalse(dq.isnan().any())
|
||||
self.assertFalse(dk.isnan().any())
|
||||
self.assertFalse(dv.isnan().any())
|
||||
|
||||
@unittest.skipIf(not PLATFORM_SUPPORTS_MEM_EFF_ATTENTION, "Fused SDPA was not built for this system")
|
||||
@parametrize("mask_dim", [1, 2, 3, 4])
|
||||
def test_mem_efficient_attention_mask_variants(self, device, mask_dim: list[int]):
|
||||
|
||||
2
third_party/cudnn_frontend
vendored
Submodule third_party/cudnn_frontend updated: f937055efc...243c7ff63b
@ -430,7 +430,7 @@ use_numpy_random_stream = False
|
||||
enable_cpp_guard_manager = True
|
||||
|
||||
# Use C++ guard manager for symbolic shapes
|
||||
enable_cpp_symbolic_shape_guards = not is_fbcode()
|
||||
enable_cpp_symbolic_shape_guards = False
|
||||
|
||||
# Enable tracing through contextlib.contextmanager
|
||||
enable_trace_contextlib = True
|
||||
|
||||
@ -752,12 +752,13 @@ def mark_static(
|
||||
|
||||
|
||||
@forbid_in_graph
|
||||
def mark_static_address(t: Any, guard: bool = True) -> None:
|
||||
def mark_static_address(t: Any, guard: bool = False) -> None:
|
||||
"""
|
||||
Marks an input tensor whose data_ptr will not change across multiple calls
|
||||
to a dynamo-compiled function. This indicates to cudagraphs that an extra allocation
|
||||
is not needed for this input. The data_ptr will be guarded if guard=True. Note:
|
||||
Tensors marked in this way will be kept alive until `torch._dynamo.reset()` is called.
|
||||
Marks an input tensor whose address should be treated as constant across calls to the
|
||||
same dynamo-compiled function. This indicates to cudagraphs that an extra allocation
|
||||
is not needed for this input. The data_ptr will be guarded if guard=True, and cause a full
|
||||
recompile if the data_ptr changes. Note: If this address changes, cudagraphs will re-record
|
||||
if guard=False.
|
||||
"""
|
||||
if not isinstance(t, torch.Tensor):
|
||||
raise TypeError(f"mark_static_address expects a tensor but received {type(t)}")
|
||||
|
||||
@ -250,8 +250,8 @@ class ResumeFunctionMetadata:
|
||||
default_factory=list
|
||||
)
|
||||
# per-offset map from new block target offsets to original block target offsets
|
||||
block_target_offset_remap: dict[int, dict[int, int]] = dataclasses.field(
|
||||
default_factory=dict
|
||||
block_target_offset_remap: dict[tuple[int, int], dict[int, int]] = (
|
||||
dataclasses.field(default_factory=dict)
|
||||
)
|
||||
|
||||
|
||||
@ -291,12 +291,14 @@ class ContinueExecutionCache:
|
||||
generated_code_metadata = ExactWeakKeyDictionary()
|
||||
|
||||
@classmethod
|
||||
def lookup(cls, code: types.CodeType, lineno: int, *key: Any) -> types.CodeType:
|
||||
def lookup(
|
||||
cls, code: types.CodeType, lineno: int, init_offset: int, *key: Any
|
||||
) -> types.CodeType:
|
||||
if code not in cls.cache:
|
||||
cls.cache[code] = {}
|
||||
key = tuple(key)
|
||||
if key not in cls.cache[code]:
|
||||
cls.cache[code][key] = cls.generate(code, lineno, *key)
|
||||
cls.cache[code][key] = cls.generate(code, lineno, init_offset, *key)
|
||||
return cls.cache[code][key]
|
||||
|
||||
@classmethod
|
||||
@ -304,7 +306,8 @@ class ContinueExecutionCache:
|
||||
cls,
|
||||
code: types.CodeType,
|
||||
lineno: int,
|
||||
offset: int,
|
||||
init_offset: int,
|
||||
resume_offset: int,
|
||||
setup_fn_target_offsets: tuple[int, ...], # only used in Python 3.11+
|
||||
nstack: int,
|
||||
argnames: tuple[str, ...],
|
||||
@ -317,7 +320,7 @@ class ContinueExecutionCache:
|
||||
# which prevents excessive recompilation of inner frames
|
||||
nested_code_objs: tuple[types.CodeType],
|
||||
) -> types.CodeType:
|
||||
assert offset is not None
|
||||
assert resume_offset is not None
|
||||
assert not (
|
||||
code.co_flags
|
||||
& (CO_GENERATOR | CO_COROUTINE | CO_ITERABLE_COROUTINE | CO_ASYNC_GENERATOR)
|
||||
@ -327,7 +330,8 @@ class ContinueExecutionCache:
|
||||
return cls.generate_based_on_original_code_object(
|
||||
code,
|
||||
lineno,
|
||||
offset,
|
||||
init_offset,
|
||||
resume_offset,
|
||||
setup_fn_target_offsets,
|
||||
nstack,
|
||||
argnames,
|
||||
@ -382,7 +386,7 @@ class ContinueExecutionCache:
|
||||
code_options["co_flags"] = code_options["co_flags"] & ~(
|
||||
CO_VARARGS | CO_VARKEYWORDS
|
||||
)
|
||||
target = next(i for i in instructions if i.offset == offset)
|
||||
target = next(i for i in instructions if i.offset == resume_offset)
|
||||
|
||||
prefix = []
|
||||
if is_py311_plus:
|
||||
@ -575,7 +579,8 @@ class ContinueExecutionCache:
|
||||
cls,
|
||||
code: types.CodeType,
|
||||
lineno: int,
|
||||
offset: int,
|
||||
init_offset: int,
|
||||
resume_offset: int,
|
||||
setup_fn_target_offsets: tuple[int, ...],
|
||||
*args: Any,
|
||||
) -> types.CodeType:
|
||||
@ -590,34 +595,63 @@ class ContinueExecutionCache:
|
||||
meta: ResumeFunctionMetadata = ContinueExecutionCache.generated_code_metadata[
|
||||
code
|
||||
]
|
||||
new_offset = -1
|
||||
|
||||
def find_new_offset(
|
||||
instructions: list[Instruction], code_options: dict[str, Any]
|
||||
) -> None:
|
||||
nonlocal new_offset
|
||||
(target,) = (i for i in instructions if i.offset == offset)
|
||||
# match the functions starting at the last instruction as we have added a prefix
|
||||
(new_target,) = (
|
||||
i2
|
||||
for i1, i2 in zip(reversed(instructions), reversed(meta.instructions))
|
||||
if i1 is target
|
||||
)
|
||||
assert target.opcode == new_target.opcode
|
||||
assert new_target.offset is not None
|
||||
new_offset = new_target.offset
|
||||
def find_orig_offset(cur_offset: int) -> int:
|
||||
orig_offset = -1
|
||||
|
||||
transform_code_object(code, find_new_offset)
|
||||
assert new_offset >= 0
|
||||
def find_orig_offset_transform(
|
||||
instructions: list[Instruction], code_options: dict[str, Any]
|
||||
) -> None:
|
||||
nonlocal orig_offset
|
||||
(target,) = (i for i in instructions if i.offset == cur_offset)
|
||||
# match the functions starting at the last instruction as we have added a prefix
|
||||
new_target_tuple = tuple(
|
||||
i2
|
||||
for i1, i2 in zip(
|
||||
reversed(instructions), reversed(meta.instructions)
|
||||
)
|
||||
if i1 is target
|
||||
)
|
||||
|
||||
if not new_target_tuple:
|
||||
# Instruction with cur_offset in instructions was not found
|
||||
# in the original code - orig_offset left as -1.
|
||||
# Caller expected to handle this case.
|
||||
return
|
||||
|
||||
assert len(new_target_tuple) == 1
|
||||
new_target = new_target_tuple[0]
|
||||
|
||||
assert target.opcode == new_target.opcode
|
||||
assert new_target.offset is not None
|
||||
orig_offset = new_target.offset
|
||||
|
||||
transform_code_object(code, find_orig_offset_transform)
|
||||
return orig_offset
|
||||
|
||||
orig_init_offset = find_orig_offset(init_offset)
|
||||
# It is fine if the initial instruction is not found in the original code;
|
||||
# this means we graph broke in the prefix, which only happens with nested graph breaks.
|
||||
# We should not be running into ambiguous graph break issues here.
|
||||
orig_resume_offset = find_orig_offset(resume_offset)
|
||||
assert orig_resume_offset > -1, (
|
||||
"resume instruction not found in original code - this is a bug."
|
||||
)
|
||||
|
||||
if sys.version_info >= (3, 11):
|
||||
# setup_fn_target_offsets currently contains the target offset of
|
||||
# each setup_fn, based on `code`. When we codegen the resume function
|
||||
# based on the original code object, `meta.code`, the offsets in
|
||||
# setup_fn_target_offsets must be based on `meta.code` instead.
|
||||
if new_offset not in meta.block_target_offset_remap:
|
||||
offset_key = (orig_init_offset, orig_resume_offset)
|
||||
# NOTE: we key by offset_key since the same resume function may graph
|
||||
# break in multiple places and we need different block_target_offset_remap's
|
||||
# for each graph break location. Keying by orig_resume_offset may not be enough
|
||||
# if 2 graph breaks on different initial offsets resume on the same instruction
|
||||
# (although this is rare and not tested anywhere).
|
||||
if offset_key not in meta.block_target_offset_remap:
|
||||
block_target_offset_remap = meta.block_target_offset_remap[
|
||||
new_offset
|
||||
offset_key
|
||||
] = {}
|
||||
|
||||
def remap_block_offsets(
|
||||
@ -625,11 +659,15 @@ class ContinueExecutionCache:
) -> None:
# NOTE: each prefix block generates exactly one PUSH_EXC_INFO,
# so we can tell which block a prefix PUSH_EXC_INFO belongs to,
# by counting. Then we can use meta.prefix_block-target_offset_remap
# by counting. Then we can use meta.prefix_block_target_offset_remap
# to determine where in the original code the PUSH_EXC_INFO offset
# replaced.
prefix_blocks: list[Instruction] = []
for inst in instructions:
# NOTE meta.prefix_block_target_offset_remap is based off of how we codegen'd
# context managers at the prefix/prologue of the resume function. It is the same for
# every graph break in the same resume function, so we do not need to recompute
# for each graph break (unlike for meta.block_target_offset_remap)
if len(prefix_blocks) == len(
meta.prefix_block_target_offset_remap
):
@ -637,38 +675,49 @@ class ContinueExecutionCache:
if inst.opname == "PUSH_EXC_INFO":
prefix_blocks.append(inst)

# offsets into prefix
# remap block target offsets for blocks generated in the resume prefix
for inst, o in zip(
prefix_blocks, meta.prefix_block_target_offset_remap
):
block_target_offset_remap[cast(int, inst.offset)] = o

# old bytecode targets are after the prefix PUSH_EXC_INFO's
old_start_offset = (
# current bytecode targets are after the prefix PUSH_EXC_INFO's
cur_start_offset = (
cast(int, prefix_blocks[-1].offset) if prefix_blocks else -1
)
# offsets into old bytecode
old_inst_offsets = sorted(
n for n in setup_fn_target_offsets if n > old_start_offset
# get the remaining block target offsets of the current bytecode
cur_inst_offsets = sorted(
n for n in setup_fn_target_offsets if n > cur_start_offset
)
targets = _filter_iter(
instructions, old_inst_offsets, lambda inst, o: inst.offset == o
instructions, cur_inst_offsets, lambda inst, o: inst.offset == o
)
new_targets = _filter_iter(
zip(reversed(instructions), reversed(meta.instructions)),
targets,
lambda v1, v2: v1[0] is v2,
# The original code and resume code should have matching suffixes.
# Match the post-prefix block target offsets of the current resume code
# and the original code.
orig_targets = reversed(
_filter_iter(
zip(reversed(instructions), reversed(meta.instructions)),
reversed(targets),
lambda v1, v2: v1[0] is v2,
)
)
for new, old in zip(new_targets, targets):
block_target_offset_remap[old.offset] = new[1].offset
for orig, cur in zip(orig_targets, targets):
block_target_offset_remap[cur.offset] = orig[1].offset

transform_code_object(code, remap_block_offsets)

# if offset is not in setup_fn_target_offsets, it is an error
# if offset_key or offset is not in setup_fn_target_offsets, it is an error
# that needs to be fixed
setup_fn_target_offsets = tuple(
meta.block_target_offset_remap[new_offset][n]
meta.block_target_offset_remap[offset_key][n]
for n in setup_fn_target_offsets
)
return ContinueExecutionCache.lookup(
meta.code, lineno, new_offset, setup_fn_target_offsets, *args
meta.code,
lineno,
orig_init_offset,
orig_resume_offset,
setup_fn_target_offsets,
*args,
)
@ -1355,6 +1355,19 @@ class InstructionTranslatorBase(
except (ReturnValueOp, YieldValueOp):
return False
except Unsupported:
# More restrictive condition than should_compile_partial_graph:
# if this condition is true, then we SHOULD NOT attempt to find
# a previous checkpoint to resume from and try to resume - we should
# immediately error out.
# The condition is more restrictive because it may be possible to resume significantly earlier
# in the code (the most recent speculation point). This happens, for example, in the case
# of a graph break in a try block.
if (
self.one_graph
or self.error_on_graph_break
or self.is_tracing_resume_prologue
):
raise
if self.current_speculation is None:
log.debug("empty checkpoint")
raise
@ -2479,7 +2492,9 @@ class InstructionTranslatorBase(
reason=GraphCompileReason("store_attr", [self.frame_summary()]),
stack_pops=2,
)
self.output.add_output_instructions([copy.copy(inst)])
inst_copy = copy.copy(inst)
inst_copy.exn_tab_entry = None
self.output.add_output_instructions([inst_copy])
self.popn(2)
self.output.add_output_instructions(
self.create_call_resume_at(
@ -2679,6 +2694,7 @@ class InstructionTranslatorBase(
if sys.version_info < (3, 12):
assert len(argnames_null) == 0, "variables should not be NULL in < 3.12"

assert cur_tx.current_instruction.offset is not None
# compile_subgraph did not codegen any NULLs,
# so we should not count NullVariables
stack_len = len(cur_tx.stack) - len(meta.stack_null_idxes)
@ -2686,7 +2702,8 @@ class InstructionTranslatorBase(
new_code: types.CodeType = ContinueExecutionCache.lookup(
cur_tx.f_code,
cur_tx.lineno,
resume_inst.offset,
cur_tx.current_instruction.offset,
resume_inst.offset, # type: ignore[arg-type]
tuple(b.target.offset for b in cur_tx.block_stack),
stack_len,
argnames,
@ -147,7 +147,7 @@ class OptimizerVariable(UserDefinedObjectVariable):

for group in self.value.param_groups:
for p in group["params"]:
mark_static_address(p)
mark_static_address(p, guard=True)

self._set_capturable(tx)

@ -240,7 +240,7 @@ class OptimizerVariable(UserDefinedObjectVariable):
self.tensor_to_source = {}

def mark_static(x):
mark_static_address(x)
mark_static_address(x, guard=True)

tree_map_only(torch.Tensor, mark_static, self.value.state)

@ -348,14 +348,14 @@ class OptimizerVariable(UserDefinedObjectVariable):

if tensor_value in self.tensor_to_source:
# mark these tensors as static for cudagraphs
mark_static_address(tensor_value)
mark_static_address(tensor_value, guard=True)
source = self.tensor_to_source[tensor_value]
self.static_tensor_names.add(tx.output.module_key_name(source.name()))
elif tensor_value in self.grad_to_source:
source = self.grad_to_source[tensor_value]
else:
# mark these tensors as static for cudagraphs
mark_static_address(tensor_value)
mark_static_address(tensor_value, guard=True)

global_name = tx.store_global_weakref_by_id(GLOBAL_KEY_PREFIX, tensor_value)
source = GlobalWeakRefSource(global_name)
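All of the optimizer changes above funnel through `mark_static_address`, which the comments describe as marking tensors static for cudagraphs; the calls now pass `guard=True` explicitly. A minimal, editorial sketch of that call from user code follows (the tensor is illustrative, and the exact guarding behavior of the flag is an assumption, not something this compare spells out):

import torch
from torch._dynamo import mark_static_address

param = torch.zeros(16)
# Treat this tensor's storage address as static (useful for cudagraphs);
# guard=True additionally asks dynamo to guard on that assumption (assumed semantics).
mark_static_address(param, guard=True)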
@ -94,7 +94,7 @@ def _default_custom_combo_kernel_horizontal_partition(
]
short_reduction = [n for n in reduction if n not in long_reduction]
if long_reduction:
log.warning(
log.debug(
"ComboKernels: %d long reduction nodes are separated",
len(long_reduction),
)
@ -107,7 +107,7 @@ def _default_custom_combo_kernel_horizontal_partition(
]
if large_pointwise:
# TODO benchmark the performance when large pointwise nodes combining with others
log.warning(
log.debug(
"ComboKernels: %d large pointwise nodes are separated",
len(large_pointwise),
)

@ -1700,7 +1700,8 @@ class PythonWrapperCodegen(CodeGen):
self.lines = MemoryPlanner(self).plan(self.lines)

def memory_plan_reuse(self):
out_names = V.graph.get_output_names()
outputs = self.get_graph_outputs()
out_names = V.graph._get_output_names(outputs)

while (
self.lines

@ -465,6 +465,10 @@ graph_partition: bool = (
== "1"
)

# register ops upon which inductor should partition the graph. name format should be
# "namespace::kernel_name" (e.g., aten::mm) for op overload packet, or
# "namespace::kernel_name.overload" (e.g., aten::mm.default).
custom_should_partition_ops: list[str] = []

# force cublas and triton to use the same precision; cublas supports TF32 for matmul operations
# when m, n, k are multiples of 16, 16, 8, whereas triton supports TF32 for matmul operations
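A short usage sketch for the new `custom_should_partition_ops` knob above (editorial; the op names are the examples from the comment, and whether a given node actually partitions still depends on the scheduler check later in this compare):

from torch._inductor import config as inductor_config

# Partition the graph around every overload of aten::mm (op overload packet name) ...
inductor_config.custom_should_partition_ops = ["aten::mm"]
# ... or only around one specific overload.
inductor_config.custom_should_partition_ops.append("aten::mm.default")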
@ -2410,11 +2410,11 @@ class GraphLowering(torch.fx.Interpreter):

return mod

def get_output_names(self) -> list[str]:
def _get_output_names(self, graph_outputs: list[ir.IRNode]) -> list[str]:
names = []
shape_counter = itertools.count(0)
none_counter = itertools.count(0)
for node in self.graph_outputs:
for node in graph_outputs:
if isinstance(node, ir.NoneAsConstantBuffer):
names.append(f"{self.name}_none{next(none_counter)}")
elif isinstance(node, ir.ShapeAsConstantBuffer):
@ -2423,6 +2423,9 @@ class GraphLowering(torch.fx.Interpreter):
names.append(node.get_name())
return names

def get_output_names(self) -> list[str]:
return self._get_output_names(self.graph_outputs)

def is_unspec_arg(self, name: str) -> bool:
# dynamo wraps unspec variable as 0d CPU tensor,
# need to convert to scalar during codegen (triton only)
@ -64,6 +64,7 @@ from torch.fx.experimental.symbolic_shapes import (
compute_unbacked_bindings,
free_symbols,
free_unbacked_symbols,
IterateExprs,
rebind_unbacked,
resolve_unbacked_bindings,
ShapeEnv,
@ -97,6 +98,7 @@ from .utils import (
argsort,
argsort_sym,
cache_on_self,
cache_on_self_and_args,
ceildiv,
convert_shape_to_inductor,
convert_shape_to_symint,
@ -933,6 +935,7 @@ class Loops(IRNode):
inner_fn: Callable[..., Any]
ranges: Sequence[_IntLike]

@cache_on_self_and_args("Loops")
def get_free_symbol_uses(
self, unbacked_only: bool = False
) -> OrderedSet[sympy.Symbol]:
@ -1222,6 +1225,7 @@ class Reduction(Loops):

__repr__ = __str__

@cache_on_self_and_args("Reduction")
def get_free_symbol_uses(self, unbacked_only: bool = False) -> OrderedSet[Symbol]:
return super().get_free_symbol_uses(unbacked_only) | OrderedSet().union(
*(get_free_symbols(e, unbacked_only) for e in self.reduction_ranges)
@ -2311,6 +2315,7 @@ class Scan(Loops):

# HACK we mimic reduction

@cache_on_self_and_args("Scan")
def get_free_symbol_uses(self, unbacked_only: bool = False) -> OrderedSet[Symbol]:
# TODO: Can combine_fn/reindex close over unbacked symbols? If so, we
# need to explicitly represent the closure so we can pull out unbacked
@ -2520,6 +2525,7 @@ class Sort(Loops):

# HACK we mimic reduction

@cache_on_self_and_args("Sort")
def get_free_symbol_uses(self, unbacked_only: bool = False) -> OrderedSet[Symbol]:
return (
super().get_free_symbol_uses(unbacked_only)
@ -2768,6 +2774,7 @@ def is_unaligned(node: IRNode) -> bool:
class BaseView(IRNode):
data: IRNode

@cache_on_self_and_args("BaseView")
def get_free_symbol_uses(self, unbacked_only: bool = False) -> OrderedSet[Symbol]:
return self.data.get_free_symbol_uses(unbacked_only)

@ -3334,6 +3341,7 @@ class ReinterpretView(BaseView):
def freeze_layout(self) -> None:
pass

@cache_on_self_and_args("ReinterpretView")
def get_free_symbol_uses(
self, unbacked_only: bool = False
) -> OrderedSet[sympy.Symbol]:
@ -3617,13 +3625,37 @@ class Layout(OutputSpec):
self.dtype = dtype
assert len(size) == len(stride), f"size={size}, stride={stride}"
assert all(isinstance(s, (Expr, int)) for s in size)
self.size = size
self.stride = stride
self.offset = offset
self._size = size
self._stride = stride
self._offset = offset
self.is_pinned = is_pinned
# is_pinned implies cpu
assert (not self.is_pinned) or (self.device.type == "cpu")

@property
def size(self) -> Sequence[Expr]:
return self._size

@size.setter
def size(self, value: Sequence[Expr]) -> None:
self._size = value

@property
def stride(self) -> Sequence[Expr]:
return self._stride

@stride.setter
def stride(self, value: Sequence[Expr]) -> None:
self._stride = value

@property
def offset(self) -> Expr:
return self._offset

@offset.setter
def offset(self, value: Expr) -> None:
self._offset = value

def __str__(self) -> str:
offset = ""
if self.offset != 0:
@ -3833,6 +3865,7 @@ class Layout(OutputSpec):
def storage_size(self) -> Expr:
return compute_required_storage_length(self.size, self.stride, self.offset) # type: ignore[arg-type]

@cache_on_self_and_args("Layout")
def get_free_symbol_uses(
self, unbacked_only: bool = False
) -> OrderedSet[sympy.Symbol]:
@ -3852,7 +3885,11 @@ class FixedLayout(Layout):


class FlexibleLayout(Layout):
"""A Tensor layout that we are allowed to change"""
"""
A Tensor layout that we are allowed to change

Assumption: layout change should NOT add or remove free symbols
"""

allow_indexing = False

@ -3937,6 +3974,33 @@ class FlexibleLayout(Layout):
fill_order = sorted(range(len(stride)), key=stride.__getitem__)
return FlexibleLayout.fill_ordered(sizes, fill_order)

@property
def size(self) -> Sequence[Expr]:
return self._size

@size.setter
def size(self, value: Sequence[Expr]) -> None:
self.assert_free_symbol_uses_unchanged("size", value)
self._size = value

@property
def stride(self) -> Sequence[Expr]:
return self._stride

@stride.setter
def stride(self, value: Sequence[Expr]) -> None:
self.assert_free_symbol_uses_unchanged("stride", value)
self._stride = value

@property
def offset(self) -> Expr:
return self._offset

@offset.setter
def offset(self, value: Expr) -> None:
self.assert_free_symbol_uses_unchanged("offset", value)
self._offset = value

def as_stride_order(
self, order: Sequence[int], allow_padding: bool = False
) -> FixedLayout:
@ -3995,6 +4059,25 @@ class FlexibleLayout(Layout):
self.is_pinned,
)

def get_initial_free_symbol_uses(self) -> dict[tuple[str, bool], sympy.Symbol]:
initial_free_symbols = {}
for name in ["size", "stride", "offset"]:
for unbacked_only in [True, False]:
key = (name, unbacked_only)
initial_free_symbols[key] = OrderedSet(
get_free_symbols(getattr(self, name), unbacked_only)
)

return initial_free_symbols

def assert_free_symbol_uses_unchanged(self, name: str, value: IterateExprs) -> None:
for unbacked_only in [True, False]:
old_free_symbols = self.initial_free_symbols[(name, unbacked_only)]
new_free_symbols = OrderedSet(get_free_symbols(value, unbacked_only))
assert new_free_symbols == old_free_symbols, (
f"Expected free symbols unchanged, but got {new_free_symbols} vs {old_free_symbols}"
)

def __init__(
self,
device: torch.device,
@ -4009,6 +4092,10 @@ class FlexibleLayout(Layout):
strides = FlexibleLayout.contiguous_strides(size)
super().__init__(device, dtype, size, strides, is_pinned=is_pinned)

# record the initial free symbols to check that we do not add new free symbols
# later when modifying sizes, strides, and offsets.
self.initial_free_symbols = self.get_initial_free_symbol_uses()


class NonOwningLayout(Layout):
"""Is a view into the storage of another tensor"""
@ -4034,6 +4121,7 @@ class NonOwningLayout(Layout):

return V.graph.sizevars.statically_known_multiple_of(offset, ALIGNMENT)

@cache_on_self_and_args("NonOwningLayout")
def get_free_symbol_uses(
self, unbacked_only: bool = False
) -> OrderedSet[sympy.Symbol]:
@ -4322,6 +4410,7 @@ class Buffer(IRNode, CodegenSymbol):
def get_read_names(self) -> OrderedSet[str]:
return OrderedSet([self.get_name()])

@cache_on_self_and_args("Buffer")
def get_free_symbol_uses(
self, unbacked_only: bool = False
) -> OrderedSet[sympy.Symbol]:
@ -4394,6 +4483,7 @@ class NoneAsConstantBuffer(IRNode):
def get_reads(self) -> OrderedSet[Dep]:
return OrderedSet()

@cache_on_self_and_args("NoneAsConstantBuffer")
def get_free_symbol_uses(
self, unbacked_only: bool = False
) -> OrderedSet[sympy.Symbol]:
@ -4413,6 +4503,7 @@ class NoneAsConstantBuffer(IRNode):
class ShapeAsConstantBuffer(IRNode):
expr: Expr

@cache_on_self_and_args("ShapeAsConstantBuffer")
def get_free_symbol_uses(
self, unbacked_only: bool = False
) -> OrderedSet[sympy.Symbol]:
@ -4485,6 +4576,7 @@ class ComputedBuffer(OperationBuffer):
self.data.get_size(),
)

@cache_on_self_and_args("ComputedBuffer")
def get_free_symbol_uses(
self, unbacked_only: bool = False
) -> OrderedSet[sympy.Symbol]:
@ -4912,6 +5004,7 @@ class TritonTemplateBuffer(TemplateBuffer):
self.subgraph_inps: Optional[list[Optional[Union[IRNode, sympy.Expr]]]] = None
self.subgraph_outs: Optional[list[Optional[IRNode]]] = None

@cache_on_self_and_args("TritonTemplateBuffer")
def get_free_symbol_uses(
self, unbacked_only: bool = False
) -> OrderedSet[sympy.Symbol]:
@ -5264,6 +5357,7 @@ class InputsKernel(OperationBuffer):
def num_reads(self) -> int:
return 1

@cache_on_self_and_args("InputsKernel")
def get_free_symbol_uses(
self, unbacked_only: bool = False
) -> OrderedSet[sympy.Symbol]:
@ -5438,6 +5532,7 @@ class ConcatKernel(NopKernel):
and not isinstance(src.data, ExternKernelAlloc)
)

@cache_on_self_and_args("ConcatKernel")
def get_free_symbol_uses(
self, unbacked_only: bool = False
) -> OrderedSet[sympy.Symbol]:
@ -6337,6 +6432,7 @@ class ExternKernel(InputsKernel):
index = sympy_subs(sympy.expand(index), replacement)
return index, tuple(new_sizes)

@cache_on_self_and_args("ExternKernel")
def get_free_symbol_uses(
self, unbacked_only: bool = False
) -> OrderedSet[sympy.Symbol]:
@ -6797,6 +6893,7 @@ class UserDefinedTritonKernel(ExternKernel):
original_fxnode_name=self.fx_node.name,
)

@cache_on_self_and_args("UserDefinedTritonKernel")
def get_free_symbol_uses(
self, unbacked_only: bool = False
) -> OrderedSet[sympy.Symbol]:
@ -7265,6 +7362,7 @@ class DynamicSelectStorageOffset(ExternKernel):
def get_unbacked_symbol_defs(self) -> OrderedSet[sympy.Symbol]:
return OrderedSet([self.unbacked_offset_symbol])

@cache_on_self_and_args("DynamicSelectStorageOffset")
def get_free_symbol_uses(
self, unbacked_only: bool = False
) -> OrderedSet[sympy.Symbol]:
@ -7327,6 +7425,7 @@ class AssertScalar(ExternKernel):
def has_side_effects(self) -> bool:
return True

@cache_on_self_and_args("AssertScalar")
def get_free_symbol_uses(
self, unbacked_only: bool = False
) -> OrderedSet[sympy.Symbol]:
@ -7999,6 +8098,7 @@ class MultiOutput(ExternKernel):
self.indices = indices
self.skip_size_stride_alignment_checks = skip_size_stride_alignment_checks

@cache_on_self_and_args("MultiOutput")
def get_free_symbol_uses(
self, unbacked_only: bool = False
) -> OrderedSet[sympy.Symbol]:
@ -8121,6 +8221,7 @@ class MutableBox(IRNode):
def realize(self) -> Optional[str]:
return self.data.realize()

@cache_on_self_and_args("MutableBox")
def get_free_symbol_uses(
self, unbacked_only: bool = False
) -> OrderedSet[sympy.Symbol]:
@ -8919,6 +9020,7 @@ class EffectfulKernel(FallbackKernel):


class NonTensorObj(IRNode):
@cache_on_self_and_args("NonTensorObj")
def get_free_symbol_uses(
self, unbacked_only: bool = False
) -> OrderedSet[sympy.Symbol]:
@ -208,9 +208,10 @@ def tuned_bmm(mat1, mat2, out_dtype=None, *, layout=None):
)
)

if use_triton_template(layout, check_max_autotune=False):
if use_triton_template(layout, check_max_autotune=False) and (
out_dtype is None or out_dtype == mat1.get_dtype()
):
# TODO: add out_dtype support for Triton Template
assert out_dtype is None, "out_dtype is not supported for Triton"

choices.extend(
V.choices.get_mm_configs(kernel_inputs, layout, [bmm_template], name)
@ -23,8 +23,6 @@ if TYPE_CHECKING:
from collections.abc import Iterator, Sequence
from types import ModuleType

import weakref

import sympy

import torch
@ -94,28 +92,6 @@ _T = TypeVar("_T")
_P = ParamSpec("_P")


_custom_should_partition_fns: weakref.WeakKeyDictionary[
torch._ops.OpOverload, Callable[..., bool]
] = weakref.WeakKeyDictionary()


def register_should_partition_rule(
op: torch._ops.OpOverload,
func: Callable[..., bool],
) -> None:
"""Register a function that says if Inductor should partition the graph on this op.

The function should be have the same signature as the operator.
Inductor will invoke the function with FakeTensors when it needs to decide
if the graph should be partitioned.

`register_should_partition_rule` is currently private and experimental.
Use at your own risk.
"""
assert isinstance(op, torch._ops.OpOverload)
_custom_should_partition_fns[op] = func


@dataclasses.dataclass
class SchedulerBuffer:
scheduler: Scheduler
@ -3953,6 +3929,12 @@ class Scheduler:
):
return -1

# in some rare case, a template can be passed in.
# Check test_interaction_with_multi_template in test_loop_ordering.py
# and https://github.com/pytorch/pytorch/issues/165579
if node1.is_template() or node2.is_template():
return -1

node1_buffer_names = node1.read_writes.buffer_names()
node2_buffer_names = node2.read_writes.buffer_names()
# Fast path: no common buffers.
@ -4654,21 +4636,21 @@ class Scheduler:
# Allow users to manually specify if a node should be partitioned
# Can only do this for FallbackKernels
ir_node = node.node
if isinstance(ir_node, torch._inductor.ir.FallbackKernel):
operator = ir_node.op_overload
if operator is not None and operator in _custom_should_partition_fns:
assert isinstance(operator, torch._ops.OpOverload)
should_partition_fn = _custom_should_partition_fns[operator]
fx_node = ir_node.get_origin_node()
assert fx_node is not None
success, fake_args, fake_kwargs = (
torch._inductor.fx_utils.get_fake_args_kwargs(fx_node)
)
assert success, (
"If this op came from a custom inductor pass, make sure to run FakeTensorUpdator"
)
should_partition = should_partition_fn(*fake_args, **fake_kwargs)
return should_partition
if isinstance(ir_node, torch._inductor.ir.FallbackKernel) and (
op := ir_node.op_overload
):
op_overload_packet_name = op.name()
op_overload_name = (
f"{op_overload_packet_name}.{op._overloadname}"
if isinstance(op, torch._ops.OpOverload)
else op_overload_packet_name
)
if (
op_overload_packet_name in config.custom_should_partition_ops
or op_overload_name in config.custom_should_partition_ops
):
assert isinstance(op, torch._ops.OpOverload)
return True

# When not using cudagraphs, keep all kernels in the `call` function
# instead of graph partition functions, since graph partition only brings
@ -4944,6 +4926,16 @@ class Scheduler:
for node in partition:
buffer_names_to_free.update(node.last_usage)

# buffer_names_to_free may contain buffers allocated in previous
# graph partitions. These buffers should also be a partition
# input.
extra_input_names = [
name
for name in (buffer_names_to_free - output_names)
if name in name_to_node
]
partition_input_names.update(extra_input_names)

input_nodes = {
name: name_to_node[name]
for name in partition_input_names
@ -626,6 +626,7 @@ def tuple_sorted(x: tuple[_T, ...]) -> list[_T]:

P = ParamSpec("P")
RV = TypeVar("RV", covariant=True)
FN_TYPE = Callable[Concatenate[Any, P], RV]


class CachedMethod(Protocol, Generic[P, RV]):
@ -665,6 +666,60 @@ def cache_on_self(fn: Callable[Concatenate[Any, P], RV]) -> CachedMethod[P, RV]:
return wrapper # type: ignore[return-value]


def cache_property_on_self(fn: Callable[P, RV]) -> CachedMethod[P, RV]:
"""
Variant of cache_on_self for properties. The only difference is the type signature.
"""
# pyrefly: ignore [bad-argument-type]
return cache_on_self(fn)


def cache_on_self_and_args(
class_name: str,
) -> Callable[[FN_TYPE[P, RV]], FN_TYPE[P, RV]]:
# include both class_name and fn_name in the key to support `super().fn(self, **args, **kwargs)` calls.

def wrapper(
fn: FN_TYPE[P, RV],
) -> FN_TYPE[P, RV]:
key = f"__{class_name}_{fn.__name__}_cache"

# wrapper is likely on the hot path, compile a specialized version of it
ctx = {"fn": fn}
exec(
f"""\
def inner(self: Any, *args: P.args, **kwargs: P.kwargs) -> RV:
args_kwargs = (args, tuple(sorted(kwargs.items())))

if not hasattr(self, "{key}"):
object.__setattr__(self, "{key}", {{}})

cache = self.{key}

try:
return cache[args_kwargs]
except KeyError:
pass

rv = fn(self, *args, **kwargs)

cache[args_kwargs] = rv
return rv
""".lstrip(),
ctx,
)
inner = functools.wraps(fn)(ctx["inner"])

def clear_cache(self: Any) -> None:
if hasattr(self, key):
delattr(self, key)

inner.clear_cache = clear_cache # type: ignore[attr-defined]
return inner

return wrapper


def aggregate_origins(
node_schedule: Union[Sequence[BaseSchedulerNode], ExternKernel],
) -> OrderedSet[Node]:
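A minimal sketch of how `cache_on_self_and_args` is applied, mirroring the `get_free_symbol_uses` call sites in the ir.py hunks above (the `Example` class and its method body are hypothetical; the import assumes the decorator lands in torch/_inductor/utils.py as shown):

from torch._inductor.utils import cache_on_self_and_args

class Example:
    def __init__(self) -> None:
        self.computed = 0

    @cache_on_self_and_args("Example")
    def get_free_symbol_uses(self, unbacked_only: bool = False) -> int:
        # Runs once per distinct (args, kwargs) on each instance; repeated
        # calls with the same arguments return the cached result.
        self.computed += 1
        return self.computed

ex = Example()
ex.get_free_symbol_uses(True)   # computed
ex.get_free_symbol_uses(True)   # served from the per-instance cache
ex.get_free_symbol_uses(False)  # different arguments, computed again

Passing the class name into the decorator keeps each class's cache attribute distinct, which is what lets `super().get_free_symbol_uses(...)` calls (as in the Reduction hunk) maintain their own cache.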
@ -2,6 +2,7 @@
// This file should only be compiled if this condition holds, so it should be
// safe.
#if defined(USE_CUDNN) || defined(USE_ROCM)
#include <ATen/detail/CUDAHooksInterface.h>
#include <torch/csrc/utils/pybind.h>

#include <tuple>
@ -32,11 +33,7 @@ version_tuple getRuntimeVersion() {
}

size_t getVersionInt() {
#ifndef USE_STATIC_CUDNN
return cudnnGetVersion();
#else
return CUDNN_VERSION;
#endif
return at::detail::getCUDAHooks().versionRuntimeCuDNN();
}

} // namespace
@ -1345,8 +1345,7 @@ class AsyncAllgatherWork : public ProcessGroupGloo::AsyncWork {
// Use single flat output tensor.
// The first dimension corresponds to the index into outputs[N],
// so copying into the actual output later is easy.
at::Tensor flatOutputTensor =
newLikeFlat(outputs[0], /*preserve_strides*/ false);
at::Tensor flatOutputTensor = newLikeFlat(outputs[0]);
GENERATE_ALL_TYPES(scalarType, setOutput, opts, flatOutputTensor);
gloo::allgather(opts);

@ -1363,7 +1362,7 @@ class AsyncAllgatherWork : public ProcessGroupGloo::AsyncWork {
}

const std::vector<at::Tensor> getOutputTensors() override {
return {newLikeFlat(outputs[0], /*preserve_strides*/ false)};
return {newLikeFlat(outputs[0])};
}

void run() override {
@ -1659,7 +1658,7 @@ class AsyncAllgatherCoalescedWork : public ProcessGroupGloo::AsyncWork {
}

const std::vector<at::Tensor> getOutputTensors() override {
return {newLikeFlat(output_lists[0], /*preserve_strides*/ false)};
return {newLikeFlat(output_lists[0])};
}

void run() override {
@ -1783,7 +1782,7 @@ class AsyncGatherWork : public ProcessGroupGloo::AsyncWork {
// This is later scattered to the separate output tensors.
at::Tensor flatOutputTensor;
if (context_->rank == root) {
flatOutputTensor = newLikeFlat(outputs[0], /*preserve_strides*/ false);
flatOutputTensor = newLikeFlat(outputs[0]);
GENERATE_ALL_TYPES(scalarType, setOutput, opts, flatOutputTensor);
}

@ -1806,8 +1805,7 @@ class AsyncGatherWork : public ProcessGroupGloo::AsyncWork {

const std::vector<at::Tensor> getOutputTensors() override {
return outputs.empty() ? std::vector<at::Tensor>{}
: std::vector<at::Tensor>{newLikeFlat(
outputs[0], /*preserve_strides*/ false)};
: std::vector<at::Tensor>{newLikeFlat(outputs[0])};
}

void run() override {
@ -2023,8 +2021,7 @@ class AsyncScatterWork : public ProcessGroupGloo::AsyncWork {

const std::vector<at::Tensor> getInputTensors() override {
return inputs.empty() ? std::vector<at::Tensor>{}
: std::vector<at::Tensor>{newLikeFlat(
inputs[0], /*preserve_strides*/ false)};
: std::vector<at::Tensor>{newLikeFlat(inputs[0])};
}

const std::vector<at::Tensor> getOutputTensors() override {
@ -4711,6 +4711,9 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::allgather(
bool same_size = check_same_size(outputTensors_);
if (same_size) {
// Flatten a vector of tensors into a single, stacked tensor.
// we can handle only contiguous inputs, because we are
// just sending ptr and numel to nccl
inputTensor = inputTensor.contiguous();
at::Tensor outputFlattened = newLikeFlat(outputTensors_);

return collective(
@ -4858,6 +4861,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::reduce_scatter(
bool same_size = check_same_size(inputTensors_);
if (same_size) {
// Flatten a vector of tensors into a single, stacked tensor.
outputTensor = outputTensor.contiguous();
at::Tensor inputFlattened = newLikeFlat(inputTensors_);

return collective(
@ -444,9 +444,7 @@ inline at::Tensor newLikeFlat(
sizes, strides, t.options().memory_format(std::nullopt));
}

inline at::Tensor newLikeFlat(
std::vector<at::Tensor>& tensors,
bool preserve_strides = true) {
inline at::Tensor newLikeFlat(std::vector<at::Tensor>& tensors) {
if (tensors.empty()) {
TORCH_CHECK(false, "Received an empty list");
}
@ -454,20 +452,7 @@ inline at::Tensor newLikeFlat(
at::DeviceGuard gpuGuard(t.device());
std::vector<int64_t> sizes{static_cast<int64_t>(tensors.size())};
sizes.insert(sizes.end(), t.sizes().begin(), t.sizes().end());
if (t.is_contiguous() ||
!preserve_strides) { // we are checking for memory format, so tensor might
// not be contiguous
// TODO handle all non-overlapping-and-dense, although if the strides
// disagree in ranks we are opening a door for more bugs than currently
// where channels-last might disagree between ranks
// fast path, don't call empty_strided
return at::empty(sizes, t.options());
} else {
// memory-dense, but not necessarily contiguous tensor
std::vector<int64_t> strides{t.numel()};
strides.insert(strides.end(), t.strides().begin(), t.strides().end());
return at::empty_strided(sizes, strides, t.options());
}
return at::empty(sizes, t.options());
}

inline std::vector<std::vector<int64_t>> getSizes(
@ -450,7 +450,7 @@ lib.define(
lib.define(
"fused_scaled_matmul_reduce_scatter("
"Tensor A, Tensor B, Tensor A_scale, Tensor B_scale, "
"str reduce_op, int orig_scatter_dim, int scatter_dim_after_maybe_reshape, str group_name, int[]? output_shape, "
"str reduce_op, int orig_scatter_dim, int scatter_dim_after_maybe_reshape, str group_name, SymInt[]? output_shape, "
"Tensor? bias = None, "
"Tensor? result_scale = None, "
"ScalarType? out_dtype = None, "
7 torch/headeronly/README.md Normal file
@ -0,0 +1,7 @@
## torch/headeronly

The inlined C++ headers in the `torch::headeronly` namespace living in this subdirectory are completely decoupled from LibTorch. These APIs are also globally listed in [torch/header_only_apis.txt](https://github.com/pytorch/pytorch/blob/main/torch/header_only_apis.txt).

There are two types of LibTorch-independent header-only headers:
1. OG header-only. Originally header-only APIs, such as `ScalarType`, `Half`, and `BFloat16`, have always been implemented in headers only. Moving them into torch/headeronly only required a code migration, a copy-pasta, if you will.
2. Made to be header-only. There are also APIs that were NOT header-only that we made to be header-only. One example of such an API is `STD_TORCH_CHECK`, which was derived from `TORCH_CHECK`. `STD_TORCH_CHECK` calls into `std::runtime_error` instead of relying on `c10::Error`, which depends on libtorch.so. As a result, `STD_TORCH_CHECK` does not have the full `TORCH_CHECK` functionality of displaying a fancy traceback when the check is not met. We intentionally maintain the design that functions that do different things should be explicitly named differently.
@ -1 +1 @@
2.9.0a0
2.9.1a0