mirror of
https://github.com/pytorch/pytorch.git
synced 2025-11-02 06:24:59 +08:00
Compare commits
144 Commits
ciflow/tru
...
bench_scal
| Author | SHA1 | Date | |
|---|---|---|---|
| 75b8295868 | |||
| defb6a80d8 | |||
| f8fccb1e48 | |||
| 5aac4cfce4 | |||
| baf91bbbfc | |||
| cbcb4f7768 | |||
| 2b93d5b450 | |||
| 6b7cd48e7e | |||
| bf5aa9e42e | |||
| b1eb6dede5 | |||
| 673060beae | |||
| 2e8e9a59a8 | |||
| fb277a5916 | |||
| 73fa0d0c63 | |||
| 36c21cc84e | |||
| 0b68814b44 | |||
| e64a814ae7 | |||
| 0b58d87aec | |||
| 757975ad50 | |||
| 291712026b | |||
| 3e77a2b478 | |||
| 82ef1b5db3 | |||
| 5f370f5c42 | |||
| 05b2e02cb4 | |||
| 12f742941d | |||
| 35180fafee | |||
| c746feb86a | |||
| c5f26db5bf | |||
| 18e99b6d45 | |||
| ab9e466928 | |||
| af4ba78543 | |||
| 282f39a4bc | |||
| a479769488 | |||
| 26c7375477 | |||
| d01f15152c | |||
| 4fae6968b1 | |||
| f9953e0f61 | |||
| 34ed7a8f0d | |||
| 2fde10d914 | |||
| 0a93295da0 | |||
| 4b898b51b9 | |||
| 550e3e6efb | |||
| 715449ca76 | |||
| 84d8d06fc3 | |||
| 60992d98b2 | |||
| 59e015e3a1 | |||
| 8904a5a7c9 | |||
| f5df9ca03a | |||
| 2998abd777 | |||
| e13580e41c | |||
| f3b8e15f20 | |||
| 5211f4c108 | |||
| ad9027b80d | |||
| a1005427bf | |||
| 35153d0846 | |||
| 7773a22cdb | |||
| 7cb467a169 | |||
| 12aac12b8d | |||
| 2b748d0a56 | |||
| 16745a882a | |||
| 8daef35cf1 | |||
| 51319ca090 | |||
| d311a3d1dc | |||
| 04adfe5ba9 | |||
| 4be1e3bf92 | |||
| e7592f4005 | |||
| d334c3649d | |||
| 9f82535c5a | |||
| 5b35fc8777 | |||
| 2f38eece7c | |||
| 830e789a55 | |||
| ad4dc52bf6 | |||
| dac9ed9790 | |||
| 1c7fe8f861 | |||
| 4e643422f6 | |||
| 3c3b278872 | |||
| 0bd12c1168 | |||
| ce8a7764e2 | |||
| d1269a0434 | |||
| c87cf1be32 | |||
| 2fc5e45a41 | |||
| f9022ba93b | |||
| ff8be889ad | |||
| 292454942e | |||
| 6c4412f72b | |||
| 78bf6186f2 | |||
| c40048472c | |||
| 3dfd0c7584 | |||
| e6ba4d0725 | |||
| bdf7cb9d9c | |||
| 6aed378958 | |||
| 8b3dc0d1b0 | |||
| 06773663b5 | |||
| 0bff65503c | |||
| 21131a2444 | |||
| 1009790ad8 | |||
| 410e6a4321 | |||
| 23c55c5b66 | |||
| 1290b077f2 | |||
| 9f9ab881b2 | |||
| f2bb22ff84 | |||
| 03f3f7899c | |||
| 771170807b | |||
| ffa90d46e6 | |||
| 0e083942cc | |||
| ce1fcff03e | |||
| a238a9a100 | |||
| fe69a2bbbd | |||
| 0be0de4ffa | |||
| 7406d2e665 | |||
| 303c9cf048 | |||
| d7d4bb7c51 | |||
| 0b1c462979 | |||
| 4a6cf0a93e | |||
| 4c963a68d7 | |||
| b20deec3d1 | |||
| 51d0d8ee67 | |||
| 70592c6819 | |||
| 259cb945f5 | |||
| e20c9bf288 | |||
| 99c8640b5d | |||
| 96b0e7aaa6 | |||
| 850ba8c96d | |||
| 1bcd736f91 | |||
| df64c0c464 | |||
| 1891239a1d | |||
| cf280ca1e8 | |||
| efc277cac7 | |||
| 4f7f43253d | |||
| 779296a3fc | |||
| 8f06a1308f | |||
| 240c13394e | |||
| 150682ba7f | |||
| ca7360e996 | |||
| 0bf604320f | |||
| 9875e70da8 | |||
| 69a4bfe8bb | |||
| 62a263b8d4 | |||
| 0da1f911dc | |||
| 8700d68fef | |||
| ab82456c16 | |||
| b23f4687fd | |||
| 2705937080 | |||
| c1eda348be |
@ -19,7 +19,7 @@ pip_install \
|
||||
transformers==4.36.2
|
||||
|
||||
pip_install coloredlogs packaging
|
||||
pip_install onnxruntime==1.23.0
|
||||
pip_install onnxruntime==1.23.1
|
||||
pip_install onnxscript==0.5.4
|
||||
|
||||
# Cache the transformers model to be used later by ONNX tests. We need to run the transformers
|
||||
|
||||
@ -334,12 +334,12 @@ sympy==1.13.3
|
||||
#Pinned versions:
|
||||
#test that import:
|
||||
|
||||
onnx==1.18.0
|
||||
onnx==1.19.1
|
||||
#Description: Required by onnx tests, and mypy and test_public_bindings.py when checking torch.onnx._internal
|
||||
#Pinned versions:
|
||||
#test that import:
|
||||
|
||||
onnxscript==0.5.3
|
||||
onnxscript==0.5.4
|
||||
#Description: Required by mypy and test_public_bindings.py when checking torch.onnx._internal
|
||||
#Pinned versions:
|
||||
#test that import:
|
||||
|
||||
@ -6,7 +6,7 @@ dependencies = [
|
||||
"GitPython==3.1.45",
|
||||
"docker==7.1.0",
|
||||
"pytest==7.3.2",
|
||||
"uv==0.8.6"
|
||||
"uv==0.9.5"
|
||||
]
|
||||
|
||||
[tool.setuptools]
|
||||
|
||||
@ -163,8 +163,13 @@ if [[ "$(uname)" != Darwin ]]; then
|
||||
MEMORY_LIMIT_MAX_JOBS=12
|
||||
NUM_CPUS=$(( $(nproc) - 2 ))
|
||||
|
||||
# Defaults here for **binary** linux builds so they can be changed in one place
|
||||
export MAX_JOBS=${MAX_JOBS:-$(( ${NUM_CPUS} > ${MEMORY_LIMIT_MAX_JOBS} ? ${MEMORY_LIMIT_MAX_JOBS} : ${NUM_CPUS} ))}
|
||||
if [[ "$(uname)" == Linux ]]; then
|
||||
# Defaults here for **binary** linux builds so they can be changed in one place
|
||||
export MAX_JOBS=${MAX_JOBS:-$(( ${NUM_CPUS} > ${MEMORY_LIMIT_MAX_JOBS} ? ${MEMORY_LIMIT_MAX_JOBS} : ${NUM_CPUS} ))}
|
||||
else
|
||||
# For other builds
|
||||
export MAX_JOBS=${NUM_CPUS}
|
||||
fi
|
||||
|
||||
cat >>"$envfile" <<EOL
|
||||
export MAX_JOBS="${MAX_JOBS}"
|
||||
|
||||
2
.github/ci_commit_pins/vision.txt
vendored
2
.github/ci_commit_pins/vision.txt
vendored
@ -1 +1 @@
|
||||
faffd5cf673615583da6517275e361cb3dbc77e6
|
||||
1752fe6809b74921644866275ab80244b96e80bc
|
||||
|
||||
1
.github/pytorch-probot.yml
vendored
1
.github/pytorch-probot.yml
vendored
@ -33,6 +33,7 @@ ciflow_push_tags:
|
||||
- ciflow/rocm
|
||||
- ciflow/rocm-mi300
|
||||
- ciflow/rocm-mi355
|
||||
- ciflow/rocm-navi31
|
||||
- ciflow/s390
|
||||
- ciflow/slow
|
||||
- ciflow/torchbench
|
||||
|
||||
@ -79,9 +79,9 @@ jobs:
|
||||
runs-on: "windows-11-arm64-preview"
|
||||
{%- else %}
|
||||
{%- if branches == "nightly" %}
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
|
||||
{%- else %}
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral"
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge.nonephemeral"
|
||||
{%- endif %}
|
||||
{%- endif %}
|
||||
timeout-minutes: !{{ common.timeout_minutes_windows_binary }}
|
||||
|
||||
8
.github/workflows/generated-windows-binary-libtorch-debug-nightly.yml
generated
vendored
8
.github/workflows/generated-windows-binary-libtorch-debug-nightly.yml
generated
vendored
@ -44,7 +44,7 @@ jobs:
|
||||
libtorch-cpu-shared-with-deps-debug-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
needs: get-label-type
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
|
||||
timeout-minutes: 360
|
||||
env:
|
||||
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
|
||||
@ -291,7 +291,7 @@ jobs:
|
||||
libtorch-cuda12_6-shared-with-deps-debug-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
needs: get-label-type
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
|
||||
timeout-minutes: 360
|
||||
env:
|
||||
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
|
||||
@ -541,7 +541,7 @@ jobs:
|
||||
libtorch-cuda12_8-shared-with-deps-debug-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
needs: get-label-type
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
|
||||
timeout-minutes: 360
|
||||
env:
|
||||
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
|
||||
@ -791,7 +791,7 @@ jobs:
|
||||
libtorch-cuda13_0-shared-with-deps-debug-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
needs: get-label-type
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
|
||||
timeout-minutes: 360
|
||||
env:
|
||||
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
|
||||
|
||||
8
.github/workflows/generated-windows-binary-libtorch-release-nightly.yml
generated
vendored
8
.github/workflows/generated-windows-binary-libtorch-release-nightly.yml
generated
vendored
@ -44,7 +44,7 @@ jobs:
|
||||
libtorch-cpu-shared-with-deps-release-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
needs: get-label-type
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
|
||||
timeout-minutes: 360
|
||||
env:
|
||||
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
|
||||
@ -291,7 +291,7 @@ jobs:
|
||||
libtorch-cuda12_6-shared-with-deps-release-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
needs: get-label-type
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
|
||||
timeout-minutes: 360
|
||||
env:
|
||||
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
|
||||
@ -541,7 +541,7 @@ jobs:
|
||||
libtorch-cuda12_8-shared-with-deps-release-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
needs: get-label-type
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
|
||||
timeout-minutes: 360
|
||||
env:
|
||||
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
|
||||
@ -791,7 +791,7 @@ jobs:
|
||||
libtorch-cuda13_0-shared-with-deps-release-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
needs: get-label-type
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
|
||||
timeout-minutes: 360
|
||||
env:
|
||||
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
|
||||
|
||||
70
.github/workflows/generated-windows-binary-wheel-nightly.yml
generated
vendored
70
.github/workflows/generated-windows-binary-wheel-nightly.yml
generated
vendored
@ -44,7 +44,7 @@ jobs:
|
||||
wheel-py3_10-cpu-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
needs: get-label-type
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
|
||||
timeout-minutes: 360
|
||||
env:
|
||||
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
|
||||
@ -279,7 +279,7 @@ jobs:
|
||||
wheel-py3_10-cuda12_6-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
needs: get-label-type
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
|
||||
timeout-minutes: 360
|
||||
env:
|
||||
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
|
||||
@ -517,7 +517,7 @@ jobs:
|
||||
wheel-py3_10-cuda12_8-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
needs: get-label-type
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
|
||||
timeout-minutes: 360
|
||||
env:
|
||||
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
|
||||
@ -755,7 +755,7 @@ jobs:
|
||||
wheel-py3_10-cuda13_0-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
needs: get-label-type
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
|
||||
timeout-minutes: 360
|
||||
env:
|
||||
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
|
||||
@ -993,7 +993,7 @@ jobs:
|
||||
wheel-py3_10-xpu-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
needs: get-label-type
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
|
||||
timeout-minutes: 360
|
||||
env:
|
||||
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
|
||||
@ -1229,7 +1229,7 @@ jobs:
|
||||
wheel-py3_11-cpu-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
needs: get-label-type
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
|
||||
timeout-minutes: 360
|
||||
env:
|
||||
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
|
||||
@ -1464,7 +1464,7 @@ jobs:
|
||||
wheel-py3_11-cuda12_6-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
needs: get-label-type
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
|
||||
timeout-minutes: 360
|
||||
env:
|
||||
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
|
||||
@ -1702,7 +1702,7 @@ jobs:
|
||||
wheel-py3_11-cuda12_8-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
needs: get-label-type
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
|
||||
timeout-minutes: 360
|
||||
env:
|
||||
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
|
||||
@ -1940,7 +1940,7 @@ jobs:
|
||||
wheel-py3_11-cuda13_0-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
needs: get-label-type
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
|
||||
timeout-minutes: 360
|
||||
env:
|
||||
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
|
||||
@ -2178,7 +2178,7 @@ jobs:
|
||||
wheel-py3_11-xpu-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
needs: get-label-type
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
|
||||
timeout-minutes: 360
|
||||
env:
|
||||
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
|
||||
@ -2414,7 +2414,7 @@ jobs:
|
||||
wheel-py3_12-cpu-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
needs: get-label-type
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
|
||||
timeout-minutes: 360
|
||||
env:
|
||||
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
|
||||
@ -2649,7 +2649,7 @@ jobs:
|
||||
wheel-py3_12-cuda12_6-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
needs: get-label-type
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
|
||||
timeout-minutes: 360
|
||||
env:
|
||||
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
|
||||
@ -2887,7 +2887,7 @@ jobs:
|
||||
wheel-py3_12-cuda12_8-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
needs: get-label-type
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
|
||||
timeout-minutes: 360
|
||||
env:
|
||||
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
|
||||
@ -3125,7 +3125,7 @@ jobs:
|
||||
wheel-py3_12-cuda13_0-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
needs: get-label-type
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
|
||||
timeout-minutes: 360
|
||||
env:
|
||||
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
|
||||
@ -3363,7 +3363,7 @@ jobs:
|
||||
wheel-py3_12-xpu-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
needs: get-label-type
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
|
||||
timeout-minutes: 360
|
||||
env:
|
||||
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
|
||||
@ -3599,7 +3599,7 @@ jobs:
|
||||
wheel-py3_13-cpu-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
needs: get-label-type
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
|
||||
timeout-minutes: 360
|
||||
env:
|
||||
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
|
||||
@ -3834,7 +3834,7 @@ jobs:
|
||||
wheel-py3_13-cuda12_6-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
needs: get-label-type
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
|
||||
timeout-minutes: 360
|
||||
env:
|
||||
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
|
||||
@ -4072,7 +4072,7 @@ jobs:
|
||||
wheel-py3_13-cuda12_8-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
needs: get-label-type
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
|
||||
timeout-minutes: 360
|
||||
env:
|
||||
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
|
||||
@ -4310,7 +4310,7 @@ jobs:
|
||||
wheel-py3_13-cuda13_0-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
needs: get-label-type
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
|
||||
timeout-minutes: 360
|
||||
env:
|
||||
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
|
||||
@ -4548,7 +4548,7 @@ jobs:
|
||||
wheel-py3_13-xpu-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
needs: get-label-type
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
|
||||
timeout-minutes: 360
|
||||
env:
|
||||
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
|
||||
@ -4784,7 +4784,7 @@ jobs:
|
||||
wheel-py3_13t-cpu-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
needs: get-label-type
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
|
||||
timeout-minutes: 360
|
||||
env:
|
||||
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
|
||||
@ -5019,7 +5019,7 @@ jobs:
|
||||
wheel-py3_13t-cuda12_6-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
needs: get-label-type
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
|
||||
timeout-minutes: 360
|
||||
env:
|
||||
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
|
||||
@ -5257,7 +5257,7 @@ jobs:
|
||||
wheel-py3_13t-cuda12_8-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
needs: get-label-type
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
|
||||
timeout-minutes: 360
|
||||
env:
|
||||
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
|
||||
@ -5495,7 +5495,7 @@ jobs:
|
||||
wheel-py3_13t-cuda13_0-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
needs: get-label-type
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
|
||||
timeout-minutes: 360
|
||||
env:
|
||||
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
|
||||
@ -5733,7 +5733,7 @@ jobs:
|
||||
wheel-py3_13t-xpu-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
needs: get-label-type
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
|
||||
timeout-minutes: 360
|
||||
env:
|
||||
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
|
||||
@ -5969,7 +5969,7 @@ jobs:
|
||||
wheel-py3_14-cpu-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
needs: get-label-type
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
|
||||
timeout-minutes: 360
|
||||
env:
|
||||
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
|
||||
@ -6204,7 +6204,7 @@ jobs:
|
||||
wheel-py3_14-cuda12_6-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
needs: get-label-type
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
|
||||
timeout-minutes: 360
|
||||
env:
|
||||
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
|
||||
@ -6442,7 +6442,7 @@ jobs:
|
||||
wheel-py3_14-cuda12_8-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
needs: get-label-type
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
|
||||
timeout-minutes: 360
|
||||
env:
|
||||
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
|
||||
@ -6680,7 +6680,7 @@ jobs:
|
||||
wheel-py3_14-cuda13_0-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
needs: get-label-type
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
|
||||
timeout-minutes: 360
|
||||
env:
|
||||
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
|
||||
@ -6918,7 +6918,7 @@ jobs:
|
||||
wheel-py3_14-xpu-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
needs: get-label-type
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
|
||||
timeout-minutes: 360
|
||||
env:
|
||||
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
|
||||
@ -7154,7 +7154,7 @@ jobs:
|
||||
wheel-py3_14t-cpu-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
needs: get-label-type
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
|
||||
timeout-minutes: 360
|
||||
env:
|
||||
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
|
||||
@ -7389,7 +7389,7 @@ jobs:
|
||||
wheel-py3_14t-cuda12_6-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
needs: get-label-type
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
|
||||
timeout-minutes: 360
|
||||
env:
|
||||
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
|
||||
@ -7627,7 +7627,7 @@ jobs:
|
||||
wheel-py3_14t-cuda12_8-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
needs: get-label-type
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
|
||||
timeout-minutes: 360
|
||||
env:
|
||||
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
|
||||
@ -7865,7 +7865,7 @@ jobs:
|
||||
wheel-py3_14t-cuda13_0-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
needs: get-label-type
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
|
||||
timeout-minutes: 360
|
||||
env:
|
||||
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
|
||||
@ -8103,7 +8103,7 @@ jobs:
|
||||
wheel-py3_14t-xpu-build:
|
||||
if: ${{ github.repository_owner == 'pytorch' }}
|
||||
needs: get-label-type
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
|
||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
|
||||
timeout-minutes: 360
|
||||
env:
|
||||
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
|
||||
|
||||
15
.github/workflows/periodic.yml
vendored
15
.github/workflows/periodic.yml
vendored
@ -147,15 +147,16 @@ jobs:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-debug
|
||||
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9
|
||||
cuda-arch-list: 8.9
|
||||
test-matrix: |
|
||||
{ include: [
|
||||
{ config: "default", shard: 1, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu", owners: ["oncall:debug-build"] },
|
||||
{ config: "default", shard: 2, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu", owners: ["oncall:debug-build"] },
|
||||
{ config: "default", shard: 3, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu", owners: ["oncall:debug-build"] },
|
||||
{ config: "default", shard: 4, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu", owners: ["oncall:debug-build"] },
|
||||
{ config: "default", shard: 5, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu", owners: ["oncall:debug-build"] },
|
||||
{ config: "default", shard: 6, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu", owners: ["oncall:debug-build"] },
|
||||
{ config: "default", shard: 7, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu", owners: ["oncall:debug-build"] },
|
||||
{ config: "default", shard: 1, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu", owners: ["oncall:debug-build"] },
|
||||
{ config: "default", shard: 2, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu", owners: ["oncall:debug-build"] },
|
||||
{ config: "default", shard: 3, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu", owners: ["oncall:debug-build"] },
|
||||
{ config: "default", shard: 4, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu", owners: ["oncall:debug-build"] },
|
||||
{ config: "default", shard: 5, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu", owners: ["oncall:debug-build"] },
|
||||
{ config: "default", shard: 6, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu", owners: ["oncall:debug-build"] },
|
||||
{ config: "default", shard: 7, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu", owners: ["oncall:debug-build"] },
|
||||
]}
|
||||
secrets: inherit
|
||||
|
||||
|
||||
63
.github/workflows/rocm-navi31.yml
vendored
Normal file
63
.github/workflows/rocm-navi31.yml
vendored
Normal file
@ -0,0 +1,63 @@
|
||||
name: rocm-navi31
|
||||
|
||||
on:
|
||||
push:
|
||||
tags:
|
||||
- ciflow/rocm-navi31/*
|
||||
workflow_dispatch:
|
||||
schedule:
|
||||
# We have several schedules so jobs can check github.event.schedule to activate only for a fraction of the runs.
|
||||
# Also run less frequently on weekends.
|
||||
- cron: 45 */2 * * 1-5
|
||||
- cron: 45 4,12 * * 0,6
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
|
||||
cancel-in-progress: true
|
||||
|
||||
permissions: read-all
|
||||
|
||||
jobs:
|
||||
target-determination:
|
||||
if: github.repository_owner == 'pytorch'
|
||||
name: before-test
|
||||
uses: ./.github/workflows/target_determination.yml
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
|
||||
linux-jammy-rocm-py3_10-build:
|
||||
if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
|
||||
name: linux-jammy-rocm-py3.10
|
||||
uses: ./.github/workflows/_linux-build.yml
|
||||
with:
|
||||
build-environment: linux-jammy-rocm-py3.10
|
||||
docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
|
||||
sync-tag: rocm-build
|
||||
test-matrix: |
|
||||
{ include: [
|
||||
{ config: "default", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx1100" },
|
||||
{ config: "default", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx1100" },
|
||||
]}
|
||||
secrets: inherit
|
||||
|
||||
linux-jammy-rocm-py3_10-test:
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
name: linux-jammy-rocm-py3_10
|
||||
uses: ./.github/workflows/_rocm-test.yml
|
||||
needs:
|
||||
- linux-jammy-rocm-py3_10-build
|
||||
- target-determination
|
||||
with:
|
||||
build-environment: linux-jammy-rocm-py3.10
|
||||
docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }}
|
||||
tests-to-include: >-
|
||||
${{ github.event_name == 'schedule' && 'test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs
|
||||
test_autograd inductor/test_torchinductor inductor/test_kernel_benchmark
|
||||
inductor/test_pad_mm inductor/test_benchmark_fusion inductor/test_aot_inductor
|
||||
inductor/test_torchinductor inductor/test_decompose_mem_bound_mm
|
||||
inductor/test_flex_attention inductor/test_max_autotune' || '' }}
|
||||
secrets: inherit
|
||||
26
.github/workflows/rocm.yml
vendored
26
.github/workflows/rocm.yml
vendored
@ -59,29 +59,3 @@ jobs:
|
||||
docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }}
|
||||
secrets: inherit
|
||||
|
||||
linux-jammy-rocm-py3_10-gfx1100-test:
|
||||
if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }}
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
name: linux-jammy-rocm-py3_10-gfx1100
|
||||
uses: ./.github/workflows/_rocm-test.yml
|
||||
needs:
|
||||
- linux-jammy-rocm-py3_10-build
|
||||
- target-determination
|
||||
with:
|
||||
build-environment: linux-jammy-rocm-py3.10
|
||||
docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
|
||||
test-matrix: |
|
||||
{ include: [
|
||||
{ config: "default", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx1100" },
|
||||
{ config: "default", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx1100" },
|
||||
]}
|
||||
tests-to-include: >
|
||||
test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs
|
||||
test_autograd inductor/test_torchinductor inductor/test_kernel_benchmark
|
||||
inductor/test_pad_mm inductor/test_benchmark_fusion inductor/test_aot_inductor
|
||||
inductor/test_torchinductor inductor/test_decompose_mem_bound_mm
|
||||
inductor/test_flex_attention inductor/test_max_autotune
|
||||
secrets: inherit
|
||||
|
||||
147
.github/workflows/trunk-tagging.yml
vendored
147
.github/workflows/trunk-tagging.yml
vendored
@ -58,8 +58,10 @@ jobs:
|
||||
else
|
||||
COMMIT_SHA="${{ github.sha }}"
|
||||
fi
|
||||
echo "sha=${COMMIT_SHA}" >> "${GITHUB_OUTPUT}"
|
||||
echo "tag_name=trunk/${COMMIT_SHA}" >> "${GITHUB_OUTPUT}"
|
||||
{
|
||||
echo "sha=${COMMIT_SHA}"
|
||||
echo "tag_name=trunk/${COMMIT_SHA}"
|
||||
} >> "${GITHUB_OUTPUT}"
|
||||
|
||||
- name: Validate commit SHA
|
||||
run: |
|
||||
@ -87,7 +89,7 @@ jobs:
|
||||
echo "✅ Commit ${COMMIT_SHA} is valid (automatic push trigger)"
|
||||
fi
|
||||
|
||||
- name: Create and push tag with retry
|
||||
- name: Create and push tag(s) with retry
|
||||
id: check_tag
|
||||
env:
|
||||
TAG_NAME: ${{ steps.commit.outputs.tag_name }}
|
||||
@ -112,14 +114,23 @@ jobs:
|
||||
return 1
|
||||
}
|
||||
|
||||
# Exit early if tag already exists
|
||||
if check_tag_exists; then
|
||||
echo "✅ Tag already exists - no action needed"
|
||||
echo "exists=true" >> "${GITHUB_OUTPUT}"
|
||||
exit 0
|
||||
fi
|
||||
# Counters for summary reporting
|
||||
created_count=0
|
||||
skipped_count=0
|
||||
failed_count=0
|
||||
|
||||
echo "Tag ${TAG_NAME} does not exist, proceeding with creation"
|
||||
# Always write outputs once on exit
|
||||
finish() {
|
||||
set +e
|
||||
if [ -n "${GITHUB_OUTPUT:-}" ]; then
|
||||
{
|
||||
echo "created_count=${created_count}"
|
||||
echo "skipped_count=${skipped_count}"
|
||||
echo "failed_count=${failed_count}"
|
||||
} >> "${GITHUB_OUTPUT}"
|
||||
fi
|
||||
}
|
||||
trap finish EXIT
|
||||
|
||||
# Retry configuration
|
||||
MAX_RETRIES=5
|
||||
@ -194,31 +205,111 @@ jobs:
|
||||
}
|
||||
}
|
||||
|
||||
# Execute with retry
|
||||
if retry_with_backoff "tag_with_retry" "Creating tag ${TAG_NAME} for commit ${COMMIT_SHA}"; then
|
||||
echo "exists=false" >> "${GITHUB_OUTPUT}"
|
||||
# New behavior for push events: enumerate commits in the push and tag each one.
|
||||
# For workflow_dispatch, retain existing single-SHA behavior.
|
||||
|
||||
# Always fetch tags once up front to improve idempotency in loops
|
||||
git fetch origin --tags --quiet || true
|
||||
|
||||
if [ "${{ github.event_name }}" = "push" ]; then
|
||||
BEFORE_SHA="${{ github.event.before }}"
|
||||
AFTER_SHA="${{ github.sha }}" # same as event.after
|
||||
|
||||
# List commits introduced by this push (old..new), oldest first for stable ordering
|
||||
commits_file="$(mktemp)"
|
||||
git rev-list --reverse "${BEFORE_SHA}..${AFTER_SHA}" > "${commits_file}"
|
||||
|
||||
if [ ! -s "${commits_file}" ]; then
|
||||
echo "No new commits found between ${BEFORE_SHA}..${AFTER_SHA}; nothing to tag."
|
||||
rm -f "${commits_file}"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
commit_count="$(wc -l < "${commits_file}" | tr -d ' ')"
|
||||
echo "Found ${commit_count} commit(s) to tag for push:"
|
||||
while IFS= read -r sha; do
|
||||
printf ' %s\n' "${sha}"
|
||||
done < "${commits_file}"
|
||||
|
||||
while IFS= read -r sha; do
|
||||
TAG_NAME="trunk/${sha}"
|
||||
COMMIT_SHA="${sha}"
|
||||
|
||||
# If tag already exists locally or remotely, skip (idempotent)
|
||||
if check_tag_exists; then
|
||||
echo "✅ Tag ${TAG_NAME} already exists - skipping"
|
||||
skipped_count=$((skipped_count + 1))
|
||||
continue
|
||||
fi
|
||||
|
||||
echo "Tag ${TAG_NAME} does not exist, proceeding with creation"
|
||||
|
||||
if retry_with_backoff "tag_with_retry" "Creating tag ${TAG_NAME} for commit ${COMMIT_SHA}"; then
|
||||
created_count=$((created_count + 1))
|
||||
else
|
||||
echo "Tag creation failed after all retry attempts for ${TAG_NAME}"
|
||||
failed_count=$((failed_count + 1))
|
||||
fi
|
||||
done < "${commits_file}"
|
||||
|
||||
rm -f "${commits_file}"
|
||||
|
||||
if [ "${failed_count}" -gt 0 ]; then
|
||||
exit 1
|
||||
fi
|
||||
exit 0
|
||||
else
|
||||
echo "Tag creation failed after all retry attempts"
|
||||
exit 1
|
||||
# workflow_dispatch path (single SHA tagging preserved)
|
||||
|
||||
# Exit early if tag already exists
|
||||
if check_tag_exists; then
|
||||
echo "✅ Tag already exists - no action needed"
|
||||
skipped_count=1
|
||||
exit 0
|
||||
fi
|
||||
|
||||
echo "Tag ${TAG_NAME} does not exist, proceeding with creation"
|
||||
|
||||
if retry_with_backoff "tag_with_retry" "Creating tag ${TAG_NAME} for commit ${COMMIT_SHA}"; then
|
||||
created_count=1
|
||||
exit 0
|
||||
else
|
||||
echo "Tag creation failed after all retry attempts"
|
||||
failed_count=1
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
|
||||
- name: Tag creation summary
|
||||
if: always()
|
||||
run: |
|
||||
if [ "${{ steps.check_tag.outputs.exists }}" = "true" ]; then
|
||||
echo "✅ Tag ${{ steps.commit.outputs.tag_name }} already existed - no action needed"
|
||||
elif [ "${{ job.status }}" = "success" ]; then
|
||||
echo "✅ Successfully created tag ${{ steps.commit.outputs.tag_name }} for commit ${{ steps.commit.outputs.sha }}"
|
||||
if [ "${{ github.event_name }}" = "push" ]; then
|
||||
echo "Trigger: push on main"
|
||||
echo "Created: ${{ steps.check_tag.outputs.created_count }}"
|
||||
echo "Skipped (already existed): ${{ steps.check_tag.outputs.skipped_count }}"
|
||||
echo "Failed: ${{ steps.check_tag.outputs.failed_count }}"
|
||||
if [ "${{ steps.check_tag.outputs.failed_count }}" = "0" ]; then
|
||||
echo "✅ Completed tagging for push range ${{ github.event.before }}..${{ github.sha }}"
|
||||
else
|
||||
echo "❌ Some tags failed to create for push range ${{ github.event.before }}..${{ github.sha }}"
|
||||
fi
|
||||
else
|
||||
echo "❌ Failed to create tag ${{ steps.commit.outputs.tag_name }} for commit ${{ steps.commit.outputs.sha }}"
|
||||
fi
|
||||
if [ "${{ steps.check_tag.outputs.failed_count }}" = "0" ]; then
|
||||
if [ "${{ steps.check_tag.outputs.created_count }}" = "0" ]; then
|
||||
echo "✅ Tag ${{ steps.commit.outputs.tag_name }} already existed - no action needed"
|
||||
else
|
||||
echo "✅ Successfully created tag ${{ steps.commit.outputs.tag_name }} for commit ${{ steps.commit.outputs.sha }}"
|
||||
fi
|
||||
else
|
||||
echo "❌ Failed to create tag ${{ steps.commit.outputs.tag_name }} for commit ${{ steps.commit.outputs.sha }}"
|
||||
fi
|
||||
|
||||
echo ""
|
||||
echo "Tag details:"
|
||||
echo " Name: ${{ steps.commit.outputs.tag_name }}"
|
||||
echo " Commit: ${{ steps.commit.outputs.sha }}"
|
||||
echo " Trigger: ${{ github.event_name }}"
|
||||
if [ -n "${{ github.event.inputs.commit_sha }}" ]; then
|
||||
echo " Manual commit: ${{ github.event.inputs.commit_sha }}"
|
||||
echo ""
|
||||
echo "Tag details:"
|
||||
echo " Name: ${{ steps.commit.outputs.tag_name }}"
|
||||
echo " Commit: ${{ steps.commit.outputs.sha }}"
|
||||
echo " Trigger: ${{ github.event_name }}"
|
||||
if [ -n "${{ github.event.inputs.commit_sha }}" ]; then
|
||||
echo " Manual commit: ${{ github.event.inputs.commit_sha }}"
|
||||
fi
|
||||
fi
|
||||
|
||||
34
.github/workflows/trunk.yml
vendored
34
.github/workflows/trunk.yml
vendored
@ -190,6 +190,40 @@ jobs:
|
||||
runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral"
|
||||
secrets: inherit
|
||||
|
||||
linux-jammy-rocm-py3_10-build:
|
||||
if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/trunk') }}
|
||||
name: linux-jammy-rocm-py3.10
|
||||
uses: ./.github/workflows/_linux-build.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build-environment: linux-jammy-rocm-py3.10
|
||||
docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
|
||||
sync-tag: rocm-build
|
||||
test-matrix: |
|
||||
{ include: [
|
||||
{ config: "default", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" },
|
||||
{ config: "default", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" },
|
||||
]}
|
||||
secrets: inherit
|
||||
|
||||
linux-jammy-rocm-py3_10-test:
|
||||
if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/trunk') }}
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
name: linux-jammy-rocm-py3.10
|
||||
uses: ./.github/workflows/_rocm-test.yml
|
||||
needs:
|
||||
- linux-jammy-rocm-py3_10-build
|
||||
- target-determination
|
||||
with:
|
||||
build-environment: linux-jammy-rocm-py3.10
|
||||
docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }}
|
||||
tests-to-include: "test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs test_autograd inductor/test_torchinductor"
|
||||
secrets: inherit
|
||||
|
||||
inductor-build:
|
||||
name: inductor-build
|
||||
uses: ./.github/workflows/_linux-build.yml
|
||||
|
||||
@ -314,13 +314,14 @@ IF(USE_FBGEMM_GENAI)
|
||||
|
||||
# Add additional HIPCC compiler flags for performance
|
||||
set(FBGEMM_GENAI_EXTRA_HIPCC_FLAGS
|
||||
-mllvm
|
||||
-amdgpu-coerce-illegal-types=1
|
||||
-mllvm
|
||||
-enable-post-misched=0
|
||||
-mllvm
|
||||
-greedy-reverse-local-assignment=1
|
||||
-fhip-new-launch-api)
|
||||
if(DEFINED ROCM_VERSION_DEV AND ROCM_VERSION_DEV VERSION_LESS "7.2.0")
|
||||
list(PREPEND FBGEMM_GENAI_EXTRA_HIPCC_FLAGS -mllvm -amdgpu-coerce-illegal-types=1)
|
||||
endif()
|
||||
|
||||
# Only compile for gfx942 for now.
|
||||
# This is rather hacky, I could not figure out a clean solution :(
|
||||
|
||||
@ -19,6 +19,7 @@
|
||||
#include <ATen/detail/MPSHooksInterface.h>
|
||||
#include <ATen/detail/MTIAHooksInterface.h>
|
||||
#include <ATen/detail/PrivateUse1HooksInterface.h>
|
||||
#include <ATen/detail/XLAHooksInterface.h>
|
||||
#include <ATen/detail/XPUHooksInterface.h>
|
||||
#include <c10/core/QEngine.h>
|
||||
#include <c10/core/impl/DeviceGuardImplInterface.h>
|
||||
@ -88,6 +89,8 @@ class TORCH_API Context {
|
||||
return at::detail::getHIPHooks();
|
||||
} else if (opt_device_type == at::kHPU) {
|
||||
return at::detail::getHPUHooks();
|
||||
} else if (opt_device_type == at::kXLA) {
|
||||
return at::detail::getXLAHooks();
|
||||
} else {
|
||||
TORCH_CHECK(
|
||||
false,
|
||||
@ -196,7 +199,7 @@ class TORCH_API Context {
|
||||
return c10::impl::hasDeviceGuardImpl(c10::DeviceType::IPU);
|
||||
}
|
||||
static bool hasXLA() {
|
||||
return c10::impl::hasDeviceGuardImpl(c10::DeviceType::XLA);
|
||||
return detail::getXLAHooks().hasXLA();
|
||||
}
|
||||
static bool hasXPU() {
|
||||
return detail::getXPUHooks().hasXPU();
|
||||
|
||||
@ -39,7 +39,7 @@ struct HostBlock {
|
||||
};
|
||||
|
||||
template <typename B>
|
||||
struct alignas(64) FreeBlockList {
|
||||
struct alignas(hardware_destructive_interference_size) FreeBlockList {
|
||||
std::mutex mutex_;
|
||||
std::deque<B*> list_;
|
||||
};
|
||||
@ -122,7 +122,7 @@ struct TORCH_API HostStats {
|
||||
// Struct containing memory allocator summary statistics for host, as they
|
||||
// are staged for reporting. This is a temporary struct that is used to
|
||||
// avoid locking the allocator while collecting stats.
|
||||
struct alignas(64) HostStatsStaged {
|
||||
struct alignas(hardware_destructive_interference_size) HostStatsStaged {
|
||||
std::mutex timing_mutex_;
|
||||
// COUNT: total allocations (active + free)
|
||||
// LOCK: access to this stat is protected by the allocator's blocks_mutex_
|
||||
@ -669,7 +669,7 @@ struct CachingHostAllocatorImpl {
|
||||
TORCH_CHECK_NOT_IMPLEMENTED(false, "Not implemented for query_event");
|
||||
}
|
||||
|
||||
alignas(64) std::mutex blocks_mutex_;
|
||||
alignas(hardware_destructive_interference_size) std::mutex blocks_mutex_;
|
||||
ska::flat_hash_set<B*> blocks_; // block list
|
||||
ska::flat_hash_map<void*, B*> ptr_to_block_;
|
||||
|
||||
@ -677,17 +677,17 @@ struct CachingHostAllocatorImpl {
|
||||
// size. This allows us to quickly find a free block of the right size.
|
||||
// We use deque to store per size free list and guard the list with its own
|
||||
// mutex.
|
||||
alignas(64) std::vector<FreeBlockList<B>> free_list_ =
|
||||
alignas(hardware_destructive_interference_size) std::vector<FreeBlockList<B>> free_list_ =
|
||||
std::vector<FreeBlockList<B>>(MAX_SIZE_INDEX);
|
||||
|
||||
alignas(64) std::mutex events_mutex_;
|
||||
alignas(hardware_destructive_interference_size) std::mutex events_mutex_;
|
||||
std::deque<std::pair<E, B*>> events_; // event queue paired with block
|
||||
|
||||
// Indicates whether the object is active.
|
||||
// Set to false in the destructor to signal background threads to stop.
|
||||
std::atomic<bool> active_{true};
|
||||
protected:
|
||||
alignas(64) HostStatsStaged stats_;
|
||||
alignas(hardware_destructive_interference_size) HostStatsStaged stats_;
|
||||
};
|
||||
|
||||
struct TORCH_API HostAllocator : public at::Allocator {
|
||||
|
||||
@ -59,9 +59,7 @@ struct TORCH_API Generator {
|
||||
|
||||
explicit Generator(c10::intrusive_ptr<c10::GeneratorImpl> gen_impl)
|
||||
: impl_(std::move(gen_impl)) {
|
||||
if (impl_.get() == nullptr) {
|
||||
throw std::runtime_error("GeneratorImpl with nullptr is not supported");
|
||||
}
|
||||
TORCH_CHECK(impl_.get(), "GeneratorImpl with nullptr is not supported");
|
||||
}
|
||||
|
||||
bool operator==(const Generator& rhs) const {
|
||||
|
||||
@ -111,9 +111,7 @@ class TORCH_API TensorBase {
|
||||
explicit TensorBase(
|
||||
c10::intrusive_ptr<TensorImpl, UndefinedTensorImpl> tensor_impl)
|
||||
: impl_(std::move(tensor_impl)) {
|
||||
if (impl_.get() == nullptr) {
|
||||
throw std::runtime_error("TensorImpl with nullptr is not supported");
|
||||
}
|
||||
TORCH_CHECK(impl_.get(), "TensorImpl with nullptr is not supported");
|
||||
}
|
||||
TensorBase(const TensorBase&) = default;
|
||||
TensorBase(TensorBase&&) noexcept = default;
|
||||
|
||||
@ -68,11 +68,7 @@ Symbol InternedStrings::_symbol(const std::string& s) {
|
||||
return it->second;
|
||||
|
||||
auto pos = s.find("::");
|
||||
if (pos == std::string::npos) {
|
||||
std::stringstream ss;
|
||||
ss << "all symbols must have a namespace, <namespace>::<string>, but found: " << s;
|
||||
throw std::runtime_error(ss.str());
|
||||
}
|
||||
TORCH_CHECK(pos != std::string::npos, "all symbols must have a namespace, <namespace>::<string>, but found: ", s);
|
||||
Symbol ns = _symbol("namespaces::" + s.substr(0, pos));
|
||||
|
||||
Symbol sym(sym_to_info_.size());
|
||||
@ -121,12 +117,7 @@ std::string Symbol::domainString() const {
|
||||
}
|
||||
|
||||
Symbol Symbol::fromDomainAndUnqualString(const std::string & d, const std::string & s) {
|
||||
if (d.compare(0, domain_prefix().size(), domain_prefix()) != 0) {
|
||||
std::ostringstream ss;
|
||||
ss << "Symbol: domain string is expected to be prefixed with '"
|
||||
<< domain_prefix() << "', e.g. 'org.pytorch.aten'";
|
||||
throw std::runtime_error(ss.str());
|
||||
}
|
||||
TORCH_CHECK(d.compare(0, domain_prefix().size(), domain_prefix()) == 0, "Symbol: domain string is expected to be prefixed with '", domain_prefix(), "', e.g. 'org.pytorch.aten'");
|
||||
std::string qualString = d.substr(domain_prefix().size()) + "::" + s;
|
||||
return fromQualString(qualString);
|
||||
}
|
||||
|
||||
@ -7,6 +7,7 @@
|
||||
#include <ATen/core/jit_type.h>
|
||||
#include <ATen/core/stack.h>
|
||||
#include <ATen/core/type_factory.h>
|
||||
#include <c10/util/Exception.h>
|
||||
#include <c10/util/StringUtil.h>
|
||||
#include <c10/util/hash.h>
|
||||
#include <c10/util/irange.h>
|
||||
@ -412,7 +413,7 @@ size_t IValue::hash(const IValue& v) {
|
||||
case Tag::Enum:
|
||||
case Tag::Stream:
|
||||
case Tag::Uninitialized:
|
||||
throw std::runtime_error(
|
||||
TORCH_CHECK(false,
|
||||
"unhashable type: '" + v.type()->repr_str() + "'");
|
||||
}
|
||||
// the above switch should be exhaustive
|
||||
|
||||
@ -8,6 +8,7 @@
|
||||
#include <ATen/core/type_factory.h>
|
||||
#include <ATen/core/qualified_name.h>
|
||||
#include <c10/util/TypeList.h>
|
||||
#include <c10/util/Exception.h>
|
||||
#include <optional>
|
||||
#include <c10/core/SymFloat.h>
|
||||
#include <c10/core/SymBool.h>
|
||||
@ -116,10 +117,8 @@ struct SingleElementType : public SharedType {
|
||||
|
||||
protected:
|
||||
SingleElementType(TypePtr elem) : SharedType(Kind), elem(std::move(elem)) {
|
||||
if (!this->elem) {
|
||||
throw std::runtime_error(c10::str(
|
||||
TORCH_CHECK(this->elem, c10::str(
|
||||
"Can not create ", typeKindToString(Kind), " with None type"));
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
@ -416,16 +415,12 @@ struct TORCH_API SymbolicShape {
|
||||
}
|
||||
|
||||
ShapeSymbol operator[](size_t i) const {
|
||||
if (!dims_) {
|
||||
throw std::runtime_error("Rank isn't fixed");
|
||||
}
|
||||
TORCH_CHECK(dims_, "Rank isn't fixed");
|
||||
return (*dims_).at(i);
|
||||
}
|
||||
|
||||
ShapeSymbol at(size_t i) const {
|
||||
if (!dims_) {
|
||||
throw std::runtime_error("Rank isn't fixed");
|
||||
}
|
||||
TORCH_CHECK(dims_, "Rank isn't fixed");
|
||||
return (*dims_).at(i);
|
||||
}
|
||||
|
||||
@ -520,9 +515,7 @@ struct VaryingShape {
|
||||
}
|
||||
|
||||
const std::optional<T> &operator[](size_t i) const {
|
||||
if (!dims_) {
|
||||
throw std::runtime_error("Rank isn't fixed");
|
||||
}
|
||||
TORCH_CHECK(dims_, "Rank isn't fixed");
|
||||
return (*dims_).at(i);
|
||||
}
|
||||
|
||||
@ -957,9 +950,7 @@ struct TORCH_API DictType : public SharedType {
|
||||
|
||||
TypePtr createWithContained(
|
||||
std::vector<TypePtr> contained_types) const override {
|
||||
if (contained_types.size() != 2) {
|
||||
throw std::runtime_error("Expected 2 contained types");
|
||||
}
|
||||
TORCH_CHECK(contained_types.size() == 2, "Expected 2 contained types");
|
||||
return create(std::move(contained_types.at(0)), std::move(contained_types.at(1)));
|
||||
}
|
||||
|
||||
|
||||
@ -8,6 +8,7 @@
|
||||
#include <ATen/core/jit_type.h>
|
||||
#include <c10/macros/Macros.h>
|
||||
#include <c10/util/env.h>
|
||||
#include <c10/util/Exception.h>
|
||||
#include <c10/util/flat_hash_map.h>
|
||||
#include <c10/util/irange.h>
|
||||
#include <array>
|
||||
@ -826,9 +827,7 @@ TupleType::TupleType(
|
||||
: NamedType(TypeKind::TupleType, std::move(name)),
|
||||
elements_(std::move(elements)),
|
||||
has_free_variables_(std::any_of(elements_.begin(), elements_.end(), [](const TypePtr& v) {
|
||||
if (!v) {
|
||||
throw std::runtime_error("Can not create tuple with None type");
|
||||
}
|
||||
TORCH_CHECK(v, "Can not create tuple with None type");
|
||||
return v->hasFreeVariables();
|
||||
})), schema_(std::move(schema)) {
|
||||
|
||||
|
||||
@ -9,6 +9,7 @@
|
||||
#include <ATen/cpu/vec/vec128/vec128_float_neon.h>
|
||||
#include <ATen/cpu/vec/vec128/vec128_half_neon.h>
|
||||
#include <ATen/cpu/vec/vec128/vec128_int_aarch64.h>
|
||||
#include <ATen/cpu/vec/vec128/vec128_uint_aarch64.h>
|
||||
#endif
|
||||
|
||||
#include <ATen/cpu/vec/vec128/vec128_convert.h>
|
||||
|
||||
@ -354,9 +354,47 @@ class Vectorized<c10::BFloat16> : public Vectorized16<
|
||||
|
||||
DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD(abs)
|
||||
Vectorized frac() const;
|
||||
DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD(neg)
|
||||
DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD(trunc)
|
||||
DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD(sqrt)
|
||||
|
||||
#ifdef __ARM_FEATURE_BF16
|
||||
Vectorized<c10::BFloat16> neg() const {
|
||||
return -values;
|
||||
}
|
||||
Vectorized<c10::BFloat16> reciprocal() const {
|
||||
return 1.0f / values;
|
||||
}
|
||||
Vectorized<c10::BFloat16> operator==(
|
||||
const Vectorized<c10::BFloat16>& other) const {
|
||||
return values == other.values;
|
||||
}
|
||||
|
||||
Vectorized<c10::BFloat16> operator!=(
|
||||
const Vectorized<c10::BFloat16>& other) const {
|
||||
return values != other.values;
|
||||
}
|
||||
|
||||
Vectorized<c10::BFloat16> operator<(
|
||||
const Vectorized<c10::BFloat16>& other) const {
|
||||
return values < other.values;
|
||||
}
|
||||
|
||||
Vectorized<c10::BFloat16> operator<=(
|
||||
const Vectorized<c10::BFloat16>& other) const {
|
||||
return values <= other.values;
|
||||
}
|
||||
|
||||
Vectorized<c10::BFloat16> operator>(
|
||||
const Vectorized<c10::BFloat16>& other) const {
|
||||
return values > other.values;
|
||||
}
|
||||
|
||||
Vectorized<c10::BFloat16> operator>=(
|
||||
const Vectorized<c10::BFloat16>& other) const {
|
||||
return values >= other.values;
|
||||
}
|
||||
#else
|
||||
DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD(neg)
|
||||
DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD(reciprocal)
|
||||
DEFINE_BINARY_COMPARISON_OPERATOR_VIA_FLOAT_METHOD(operator==)
|
||||
DEFINE_BINARY_COMPARISON_OPERATOR_VIA_FLOAT_METHOD(operator!=)
|
||||
@ -364,6 +402,7 @@ class Vectorized<c10::BFloat16> : public Vectorized16<
|
||||
DEFINE_BINARY_COMPARISON_OPERATOR_VIA_FLOAT_METHOD(operator<=)
|
||||
DEFINE_BINARY_COMPARISON_OPERATOR_VIA_FLOAT_METHOD(operator>)
|
||||
DEFINE_BINARY_COMPARISON_OPERATOR_VIA_FLOAT_METHOD(operator>=)
|
||||
#endif
|
||||
|
||||
#undef DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD
|
||||
#undef DEFINE_BINARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD
|
||||
@ -412,28 +451,52 @@ template <>
|
||||
Vectorized<c10::BFloat16> inline operator+(
|
||||
const Vectorized<c10::BFloat16>& a,
|
||||
const Vectorized<c10::BFloat16>& b) {
|
||||
#ifdef __ARM_FEATURE_BF16
|
||||
bfloat16x8_t x = a;
|
||||
bfloat16x8_t y = b;
|
||||
return x + y;
|
||||
#else
|
||||
return binary_operator_via_float(std::plus<Vectorized<float>>(), a, b);
|
||||
#endif
|
||||
}
|
||||
|
||||
template <>
|
||||
Vectorized<c10::BFloat16> inline operator-(
|
||||
const Vectorized<c10::BFloat16>& a,
|
||||
const Vectorized<c10::BFloat16>& b) {
|
||||
#ifdef __ARM_FEATURE_BF16
|
||||
bfloat16x8_t x = a;
|
||||
bfloat16x8_t y = b;
|
||||
return x - y;
|
||||
#else
|
||||
return binary_operator_via_float(std::minus<Vectorized<float>>(), a, b);
|
||||
#endif
|
||||
}
|
||||
|
||||
template <>
|
||||
Vectorized<c10::BFloat16> inline operator*(
|
||||
const Vectorized<c10::BFloat16>& a,
|
||||
const Vectorized<c10::BFloat16>& b) {
|
||||
#ifdef __ARM_FEATURE_BF16
|
||||
bfloat16x8_t x = a;
|
||||
bfloat16x8_t y = b;
|
||||
return x * y;
|
||||
#else
|
||||
return binary_operator_via_float(std::multiplies<Vectorized<float>>(), a, b);
|
||||
#endif
|
||||
}
|
||||
|
||||
template <>
|
||||
Vectorized<c10::BFloat16> inline operator/(
|
||||
const Vectorized<c10::BFloat16>& a,
|
||||
const Vectorized<c10::BFloat16>& b) {
|
||||
#ifdef __ARM_FEATURE_BF16
|
||||
bfloat16x8_t x = a;
|
||||
bfloat16x8_t y = b;
|
||||
return x / y;
|
||||
#else
|
||||
return binary_operator_via_float(std::divides<Vectorized<float>>(), a, b);
|
||||
#endif
|
||||
}
|
||||
|
||||
// frac. Implement this here so we can use subtraction
|
||||
@ -544,12 +607,19 @@ Vectorized<c10::BFloat16> inline fmadd(
|
||||
const Vectorized<c10::BFloat16>& a,
|
||||
const Vectorized<c10::BFloat16>& b,
|
||||
const Vectorized<c10::BFloat16>& c) {
|
||||
#ifdef __ARM_FEATURE_BF16
|
||||
bfloat16x8_t x = a;
|
||||
bfloat16x8_t y = b;
|
||||
bfloat16x8_t z = c;
|
||||
return x * y + z;
|
||||
#else
|
||||
// NOTE [BF16 FMA]: There isn't an FMA that accumulates into BF16! Also,
|
||||
// vbfmlalbq_f32 and vbfmlaltq_f32 take the even and odd-numbered
|
||||
// elements, not the bottom and top half, so they don't seem
|
||||
// particularly useful here. Ideally we would include dot product in
|
||||
// the Vectorized interface...
|
||||
return a * b + c;
|
||||
#endif
|
||||
}
|
||||
|
||||
template <>
|
||||
@ -557,8 +627,15 @@ Vectorized<c10::BFloat16> inline fnmadd(
|
||||
const Vectorized<c10::BFloat16>& a,
|
||||
const Vectorized<c10::BFloat16>& b,
|
||||
const Vectorized<c10::BFloat16>& c) {
|
||||
#ifdef __ARM_FEATURE_BF16
|
||||
bfloat16x8_t x = a;
|
||||
bfloat16x8_t y = b;
|
||||
bfloat16x8_t z = c;
|
||||
return (-x) * y + z;
|
||||
#else
|
||||
// See NOTE [BF16 FMA] above.
|
||||
return -a * b + c;
|
||||
#endif
|
||||
}
|
||||
|
||||
template <>
|
||||
@ -566,8 +643,15 @@ Vectorized<c10::BFloat16> inline fmsub(
|
||||
const Vectorized<c10::BFloat16>& a,
|
||||
const Vectorized<c10::BFloat16>& b,
|
||||
const Vectorized<c10::BFloat16>& c) {
|
||||
#ifdef __ARM_FEATURE_BF16
|
||||
bfloat16x8_t x = a;
|
||||
bfloat16x8_t y = b;
|
||||
bfloat16x8_t z = c;
|
||||
return x * y - z;
|
||||
#else
|
||||
// See NOTE [BF16 FMA] above.
|
||||
return a * b - c;
|
||||
#endif
|
||||
}
|
||||
|
||||
template <>
|
||||
@ -575,8 +659,15 @@ Vectorized<c10::BFloat16> inline fnmsub(
|
||||
const Vectorized<c10::BFloat16>& a,
|
||||
const Vectorized<c10::BFloat16>& b,
|
||||
const Vectorized<c10::BFloat16>& c) {
|
||||
#ifdef __ARM_FEATURE_BF16
|
||||
bfloat16x8_t x = a;
|
||||
bfloat16x8_t y = b;
|
||||
bfloat16x8_t z = c;
|
||||
return (-x) * y - z;
|
||||
#else
|
||||
// See NOTE [BF16 FMA] above.
|
||||
return -a * b - c;
|
||||
#endif
|
||||
}
|
||||
|
||||
#endif // !defined(C10_MOBILE) && defined(__aarch64__)
|
||||
|
||||
378
aten/src/ATen/cpu/vec/vec128/vec128_uint_aarch64.h
Normal file
378
aten/src/ATen/cpu/vec/vec128/vec128_uint_aarch64.h
Normal file
@ -0,0 +1,378 @@
|
||||
#pragma once
|
||||
|
||||
#include <ATen/cpu/vec/intrinsics.h>
|
||||
#include <ATen/cpu/vec/vec_base.h>
|
||||
#include <c10/macros/Macros.h>
|
||||
#include <c10/util/irange.h>
|
||||
|
||||
namespace at::vec {
|
||||
// Note [CPU_CAPABILITY namespace]
|
||||
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
// This header, and all of its subheaders, will be compiled with
|
||||
// different architecture flags for each supported set of vector
|
||||
// intrinsics. So we need to make sure they aren't inadvertently
|
||||
// linked together. We do this by declaring objects in an `inline
|
||||
// namespace` which changes the name mangling, but can still be
|
||||
// accessed as `at::vec`.
|
||||
inline namespace CPU_CAPABILITY {
|
||||
|
||||
#define VEC_UINT_NEON_TEMPLATE(vl, bit) \
|
||||
template <> \
|
||||
struct is_vec_specialized_for<uint##bit##_t> : std::bool_constant<true> {}; \
|
||||
\
|
||||
template <> \
|
||||
class Vectorized<uint##bit##_t> { \
|
||||
using neon_type = uint##bit##x##vl##_t; \
|
||||
\
|
||||
private: \
|
||||
neon_type values; \
|
||||
\
|
||||
public: \
|
||||
using value_type = uint##bit##_t; \
|
||||
using size_type = int; \
|
||||
static constexpr size_type size() { \
|
||||
return vl; \
|
||||
} \
|
||||
Vectorized() { \
|
||||
values = vdupq_n_u##bit(0); \
|
||||
} \
|
||||
Vectorized(neon_type v) : values(v) {} \
|
||||
Vectorized(uint##bit##_t val); \
|
||||
template < \
|
||||
typename... Args, \
|
||||
typename = std::enable_if_t<(sizeof...(Args) == size())>> \
|
||||
Vectorized(Args... vals) { \
|
||||
__at_align__ uint##bit##_t buffer[size()] = {vals...}; \
|
||||
values = vld1q_u##bit(buffer); \
|
||||
} \
|
||||
operator neon_type() const { \
|
||||
return values; \
|
||||
} \
|
||||
static Vectorized<uint##bit##_t> loadu( \
|
||||
const void* ptr, \
|
||||
uint64_t count = size()); \
|
||||
void store(void* ptr, uint64_t count = size()) const; \
|
||||
template <uint64_t mask> \
|
||||
static Vectorized<uint##bit##_t> blend( \
|
||||
const Vectorized<uint##bit##_t>& a, \
|
||||
const Vectorized<uint##bit##_t>& b); \
|
||||
static Vectorized<uint##bit##_t> blendv( \
|
||||
const Vectorized<uint##bit##_t>& a, \
|
||||
const Vectorized<uint##bit##_t>& b, \
|
||||
const Vectorized<uint##bit##_t>& mask_) { \
|
||||
return vbslq_u##bit(mask_.values, b, a); \
|
||||
} \
|
||||
template <typename step_t> \
|
||||
static Vectorized<uint##bit##_t> arange( \
|
||||
value_type base = 0, \
|
||||
step_t step = static_cast<step_t>(1)); \
|
||||
static Vectorized<uint##bit##_t> set( \
|
||||
const Vectorized<uint##bit##_t>& a, \
|
||||
const Vectorized<uint##bit##_t>& b, \
|
||||
uint64_t count = size()); \
|
||||
const uint##bit##_t& operator[](uint idx) const = delete; \
|
||||
uint##bit##_t& operator[](uint idx) = delete; \
|
||||
Vectorized<uint##bit##_t> abs() const { \
|
||||
return values; \
|
||||
} \
|
||||
Vectorized<uint##bit##_t> real() const { \
|
||||
return values; \
|
||||
} \
|
||||
Vectorized<uint##bit##_t> imag() const { \
|
||||
return vdupq_n_u##bit(0); \
|
||||
} \
|
||||
Vectorized<uint##bit##_t> conj() const { \
|
||||
return values; \
|
||||
} \
|
||||
Vectorized<uint##bit##_t> neg() const { \
|
||||
return vreinterpretq_u##bit##_s##bit( \
|
||||
vnegq_s##bit(vreinterpretq_s##bit##_u##bit(values))); \
|
||||
} \
|
||||
uint##bit##_t reduce_add() const { \
|
||||
return vaddvq_u##bit(values); \
|
||||
} \
|
||||
uint##bit##_t reduce_max() const; \
|
||||
Vectorized<uint##bit##_t> operator==( \
|
||||
const Vectorized<uint##bit##_t>& other) const { \
|
||||
return Vectorized<value_type>(vceqq_u##bit(values, other.values)); \
|
||||
} \
|
||||
Vectorized<uint##bit##_t> operator!=( \
|
||||
const Vectorized<uint##bit##_t>& other) const; \
|
||||
Vectorized<uint##bit##_t> operator<( \
|
||||
const Vectorized<uint##bit##_t>& other) const { \
|
||||
return Vectorized<value_type>(vcltq_u##bit(values, other.values)); \
|
||||
} \
|
||||
Vectorized<uint##bit##_t> operator<=( \
|
||||
const Vectorized<uint##bit##_t>& other) const { \
|
||||
return Vectorized<value_type>(vcleq_u##bit(values, other.values)); \
|
||||
} \
|
||||
Vectorized<uint##bit##_t> operator>( \
|
||||
const Vectorized<uint##bit##_t>& other) const { \
|
||||
return Vectorized<value_type>(vcgtq_u##bit(values, other.values)); \
|
||||
} \
|
||||
Vectorized<uint##bit##_t> operator>=( \
|
||||
const Vectorized<uint##bit##_t>& other) const { \
|
||||
return Vectorized<value_type>(vcgeq_u##bit(values, other.values)); \
|
||||
} \
|
||||
Vectorized<uint##bit##_t> eq( \
|
||||
const Vectorized<uint##bit##_t>& other) const; \
|
||||
Vectorized<uint##bit##_t> ne( \
|
||||
const Vectorized<uint##bit##_t>& other) const; \
|
||||
Vectorized<uint##bit##_t> gt( \
|
||||
const Vectorized<uint##bit##_t>& other) const; \
|
||||
Vectorized<uint##bit##_t> ge( \
|
||||
const Vectorized<uint##bit##_t>& other) const; \
|
||||
Vectorized<uint##bit##_t> lt( \
|
||||
const Vectorized<uint##bit##_t>& other) const; \
|
||||
Vectorized<uint##bit##_t> le( \
|
||||
const Vectorized<uint##bit##_t>& other) const; \
|
||||
}; \
|
||||
template <> \
|
||||
Vectorized<uint##bit##_t> inline operator+( \
|
||||
const Vectorized<uint##bit##_t>& a, \
|
||||
const Vectorized<uint##bit##_t>& b) { \
|
||||
return vaddq_u##bit(a, b); \
|
||||
} \
|
||||
template <> \
|
||||
Vectorized<uint##bit##_t> inline operator-( \
|
||||
const Vectorized<uint##bit##_t>& a, \
|
||||
const Vectorized<uint##bit##_t>& b) { \
|
||||
return vsubq_u##bit(a, b); \
|
||||
} \
|
||||
template <> \
|
||||
Vectorized<uint##bit##_t> inline operator&( \
|
||||
const Vectorized<uint##bit##_t>& a, \
|
||||
const Vectorized<uint##bit##_t>& b) { \
|
||||
return vandq_u##bit(a, b); \
|
||||
} \
|
||||
template <> \
|
||||
Vectorized<uint##bit##_t> inline operator|( \
|
||||
const Vectorized<uint##bit##_t>& a, \
|
||||
const Vectorized<uint##bit##_t>& b) { \
|
||||
return vorrq_u##bit(a, b); \
|
||||
} \
|
||||
template <> \
|
||||
Vectorized<uint##bit##_t> inline operator^( \
|
||||
const Vectorized<uint##bit##_t>& a, \
|
||||
const Vectorized<uint##bit##_t>& b) { \
|
||||
return veorq_u##bit(a, b); \
|
||||
} \
|
||||
Vectorized<uint##bit##_t> inline Vectorized<uint##bit##_t>::eq( \
|
||||
const Vectorized<uint##bit##_t>& other) const { \
|
||||
return (*this == other) & Vectorized<uint##bit##_t>(1); \
|
||||
} \
|
||||
Vectorized<uint##bit##_t> inline Vectorized<uint##bit##_t>::ne( \
|
||||
const Vectorized<uint##bit##_t>& other) const { \
|
||||
return (*this != other) & Vectorized<uint##bit##_t>(1); \
|
||||
} \
|
||||
Vectorized<uint##bit##_t> inline Vectorized<uint##bit##_t>::gt( \
|
||||
const Vectorized<uint##bit##_t>& other) const { \
|
||||
return (*this > other) & Vectorized<uint##bit##_t>(1); \
|
||||
} \
|
||||
Vectorized<uint##bit##_t> inline Vectorized<uint##bit##_t>::ge( \
|
||||
const Vectorized<uint##bit##_t>& other) const { \
|
||||
return (*this >= other) & Vectorized<uint##bit##_t>(1); \
|
||||
} \
|
||||
Vectorized<uint##bit##_t> inline Vectorized<uint##bit##_t>::lt( \
|
||||
const Vectorized<uint##bit##_t>& other) const { \
|
||||
return (*this < other) & Vectorized<uint##bit##_t>(1); \
|
||||
} \
|
||||
Vectorized<uint##bit##_t> inline Vectorized<uint##bit##_t>::le( \
|
||||
const Vectorized<uint##bit##_t>& other) const { \
|
||||
return (*this <= other) & Vectorized<uint##bit##_t>(1); \
|
||||
}
|
||||
|
||||
VEC_UINT_NEON_TEMPLATE(16, 8)
|
||||
|
||||
inline uint8_t Vectorized<uint8_t>::reduce_max() const {
|
||||
return vmaxvq_u8(values);
|
||||
}
|
||||
|
||||
template <>
|
||||
Vectorized<uint8_t> inline operator*(
|
||||
const Vectorized<uint8_t>& a,
|
||||
const Vectorized<uint8_t>& b) {
|
||||
return vmulq_u8(a, b);
|
||||
}
|
||||
|
||||
template <>
|
||||
inline Vectorized<uint8_t> operator~(const Vectorized<uint8_t>& a) {
|
||||
return vmvnq_u8(a);
|
||||
}
|
||||
|
||||
// Lane-wise inequality, computed as the bitwise complement of the equality
// mask produced by operator==.
inline Vectorized<uint8_t> Vectorized<uint8_t>::operator!=(
    const Vectorized<uint8_t>& other) const {
  const auto equal_mask = (*this == other);
  return ~equal_mask;
}
|
||||
|
||||
template <>
|
||||
Vectorized<uint8_t> inline minimum(
|
||||
const Vectorized<uint8_t>& a,
|
||||
const Vectorized<uint8_t>& b) {
|
||||
return vminq_u8(a, b);
|
||||
}
|
||||
|
||||
template <>
|
||||
Vectorized<uint8_t> inline maximum(
|
||||
const Vectorized<uint8_t>& a,
|
||||
const Vectorized<uint8_t>& b) {
|
||||
return vmaxq_u8(a, b);
|
||||
}
|
||||
|
||||
template <uint64_t mask>
|
||||
Vectorized<uint8_t> Vectorized<uint8_t>::blend(
|
||||
const Vectorized<uint8_t>& a,
|
||||
const Vectorized<uint8_t>& b) {
|
||||
// Build an array of flags: each bit of element is 1 if the corresponding bit
|
||||
// in 'mask' is set, 0 otherwise.
|
||||
uint8x16_t maskArray = {
|
||||
(mask & 1LL) ? 0xFF : 0,
|
||||
(mask & 2LL) ? 0xFF : 0,
|
||||
(mask & 4LL) ? 0xFF : 0,
|
||||
(mask & 8LL) ? 0xFF : 0,
|
||||
(mask & 16LL) ? 0xFF : 0,
|
||||
(mask & 32LL) ? 0xFF : 0,
|
||||
(mask & 64LL) ? 0xFF : 0,
|
||||
(mask & 128LL) ? 0xFF : 0,
|
||||
(mask & 256LL) ? 0xFF : 0,
|
||||
(mask & 512LL) ? 0xFF : 0,
|
||||
(mask & 1024LL) ? 0xFF : 0,
|
||||
(mask & 2048LL) ? 0xFF : 0,
|
||||
(mask & 4096LL) ? 0xFF : 0,
|
||||
(mask & 8192LL) ? 0xFF : 0,
|
||||
(mask & 16384LL) ? 0xFF : 0,
|
||||
(mask & 32768LL) ? 0xFF : 0};
|
||||
// Use BSL to select elements from b where the mask is 1, else from a
|
||||
return vbslq_u8(maskArray, b.values, a.values);
|
||||
}
|
||||
|
||||
// Defines the broadcast constructor, loadu() and store() of
// Vectorized<uint{bit}_t> on top of the matching NEON intrinsics
// (vdupq_n_u{bit}, vld1q_u{bit}, vst1q_u{bit}).
//  - loadu(ptr, count): reads `count` elements from `ptr`; when `count` is not
//    the full vector size the remaining lanes are zero-filled.
//  - store(ptr, count): writes only the first `count` lanes to `ptr`.
// The `vl` parameter is not referenced by the macro body.
#define VEC_UINT_NEON_OPS(vl, bit) \
  inline Vectorized<uint##bit##_t>::Vectorized(uint##bit##_t val) { \
    values = vdupq_n_u##bit(val); \
  } \
  inline Vectorized<uint##bit##_t> Vectorized<uint##bit##_t>::loadu( \
      const void* ptr, uint64_t count) { \
    if (count == size()) { \
      return vld1q_u##bit(reinterpret_cast<const uint##bit##_t*>(ptr)); \
    } \
    /* Partial load: stage through a zeroed scratch buffer so that the */ \
    /* full-width vector load never reads past the caller's data. */ \
    __at_align__ uint##bit##_t padded[size()]; \
    for (const auto lane : c10::irange(size())) { \
      padded[lane] = 0; \
    } \
    std::memcpy(padded, ptr, count * sizeof(uint##bit##_t)); \
    return vld1q_u##bit(padded); \
  } \
  inline void Vectorized<uint##bit##_t>::store(void* ptr, uint64_t count) \
      const { \
    if (count == size()) { \
      vst1q_u##bit(reinterpret_cast<uint##bit##_t*>(ptr), values); \
      return; \
    } \
    /* Partial store: spill the whole vector, then copy out `count` lanes. */ \
    uint##bit##_t spilled[size()]; \
    vst1q_u##bit(spilled, values); \
    std::memcpy(ptr, spilled, count * sizeof(uint##bit##_t)); \
  }
|
||||
|
||||
VEC_UINT_NEON_OPS(16, 8)
|
||||
|
||||
template <typename step_t>
|
||||
inline Vectorized<uint8_t> Vectorized<uint8_t>::arange(
|
||||
uint8_t base,
|
||||
step_t step) {
|
||||
const Vectorized<uint8_t> base_vec(base);
|
||||
const Vectorized<uint8_t> step_vec(step);
|
||||
const uint8x16_t step_sizes = {
|
||||
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
|
||||
return vmlaq_u8(base_vec, step_sizes, step_vec);
|
||||
}
|
||||
|
||||
template <>
|
||||
Vectorized<uint8_t> inline operator>>(
|
||||
const Vectorized<uint8_t>& a,
|
||||
const Vectorized<uint8_t>& b) {
|
||||
uint8x16_t x = a;
|
||||
uint8x16_t bound = vdupq_n_u8(8);
|
||||
uint8x16_t z = vminq_u8(b, bound);
|
||||
return x >> z;
|
||||
}
|
||||
|
||||
template <>
|
||||
Vectorized<uint8_t> inline operator<<(
|
||||
const Vectorized<uint8_t>& a,
|
||||
const Vectorized<uint8_t>& b) {
|
||||
uint8x16_t bound = vdupq_n_u8(8);
|
||||
uint8x16_t z = vminq_u8(b, bound);
|
||||
return vshlq_u8(a, vreinterpretq_s8_u8(z));
|
||||
}
|
||||
|
||||
// Returns a vector whose first `count` lanes come from `b` and whose
// remaining lanes come from `a`. count == 0 yields `a` unchanged and
// count >= 16 yields `b` unchanged.
inline Vectorized<uint8_t> Vectorized<uint8_t>::set(
    const Vectorized<uint8_t>& a,
    const Vectorized<uint8_t>& b,
    uint64_t count) {
  if (count == 0) {
    return a;
  }
  if (count >= 16) {
    return b;
  }
  // Here 1 <= count <= 15, so the narrowing cast below is lossless.
  const uint8x16_t lane_index = {
      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
  // Selector lane i is 0xFF when i < count and 0 otherwise.
  const uint8x16_t take_from_b =
      vcltq_u8(lane_index, vdupq_n_u8(static_cast<uint8_t>(count)));
  // BSL takes bits from `b` where the selector is set and from `a` elsewhere.
  return vbslq_u8(take_from_b, b.values, a.values);
}
|
||||
|
||||
template <>
|
||||
Vectorized<uint8_t> inline operator/(
|
||||
const Vectorized<uint8_t>& a,
|
||||
const Vectorized<uint8_t>& b) {
|
||||
uint8x16_t x = a;
|
||||
uint8x16_t y = b;
|
||||
return x / y;
|
||||
}
|
||||
|
||||
template <>
|
||||
Vectorized<uint8_t> inline clamp(
|
||||
const Vectorized<uint8_t>& a,
|
||||
const Vectorized<uint8_t>& min,
|
||||
const Vectorized<uint8_t>& max) {
|
||||
return minimum(max, maximum(min, a));
|
||||
}
|
||||
|
||||
template <>
|
||||
Vectorized<uint8_t> inline clamp_max(
|
||||
const Vectorized<uint8_t>& a,
|
||||
const Vectorized<uint8_t>& max) {
|
||||
return minimum(max, a);
|
||||
}
|
||||
|
||||
template <>
|
||||
Vectorized<uint8_t> inline clamp_min(
|
||||
const Vectorized<uint8_t>& a,
|
||||
const Vectorized<uint8_t>& min) {
|
||||
return maximum(min, a);
|
||||
}
|
||||
|
||||
} // namespace CPU_CAPABILITY
|
||||
} // namespace at::vec
|
||||
@ -1390,7 +1390,7 @@ std::pair<Vectorized<float>, Vectorized<float>> inline convert_int8_to_float(
|
||||
|
||||
std::pair<Vectorized<float>, Vectorized<float>> inline convert_int8_to_float(
|
||||
at::vec::Vectorized<uint8_t> src) {
|
||||
auto u8x8 = vld1_u8(src.operator const uint8_t*());
|
||||
auto u8x8 = vget_low_u8(src);
|
||||
auto u16x8 = vmovl_u8(u8x8);
|
||||
auto u32x4_hi = vmovl_u16(vget_high_u16(u16x8));
|
||||
auto u32x4_lo = vmovl_u16(vget_low_u16(u16x8));
|
||||
@ -1412,7 +1412,7 @@ Vectorized<float> inline convert_int8_half_register_to_float(
|
||||
|
||||
Vectorized<float> inline convert_int8_half_register_to_float(
|
||||
at::vec::Vectorized<uint8_t> src) {
|
||||
auto u8x8 = vld1_u8(src.operator const uint8_t*());
|
||||
auto u8x8 = vget_low_u8(src);
|
||||
auto u16x8 = vmovl_u8(u8x8);
|
||||
auto u32x4_lo = vmovl_u16(vget_low_u16(u16x8));
|
||||
|
||||
|
||||
192
aten/src/ATen/cuda/CUDAGreenContext.cpp
Normal file
192
aten/src/ATen/cuda/CUDAGreenContext.cpp
Normal file
@ -0,0 +1,192 @@
|
||||
#include <ATen/cuda/CUDAGreenContext.h>
|
||||
|
||||
namespace at::cuda {
|
||||
// Creates a green context on `device_id` from a single SM resource group.
// `num_sms` is forwarded to cuDevSmResourceSplitByCount as the requested SM
// count for that group. Raises if the driver is older than 12.8, if any
// driver call fails, or if the build has no green-context support.
GreenContext::GreenContext(uint32_t device_id, uint32_t num_sms) {
#if CUDA_HAS_GREEN_CONTEXT
  int driver_version;
  C10_CUDA_CHECK(cudaDriverGetVersion(&driver_version));
  TORCH_CHECK(
      driver_version >= 12080, "cuda driver too old to use green context!");

  CUcontext pctx = nullptr;
  C10_CUDA_DRIVER_CHECK(c10::cuda::DriverAPI::get()->cuCtxGetCurrent_(&pctx));
  if (C10_UNLIKELY(!pctx)) {
    TORCH_WARN(
        "Attempted to create a green context but"
        " there was no primary context! Creating a primary context...");
    // cudaFree(0) forces runtime initialization. Check the result so that an
    // initialization failure is reported here rather than as a confusing
    // error from one of the driver calls below.
    C10_CUDA_CHECK(cudaFree(0));
  }

  CUdevice device;
  device_id_ = device_id;
  C10_CUDA_DRIVER_CHECK(
      c10::cuda::DriverAPI::get()->cuDeviceGet_(&device, device_id));

  // Query the device's SM resource.
  CUdevResource device_resource;
  C10_CUDA_DRIVER_CHECK(c10::cuda::DriverAPI::get()->cuDeviceGetDevResource_(
      device, &device_resource, CU_DEV_RESOURCE_TYPE_SM));

  // Split it into exactly one group; whatever is left over lands in
  // `remaining` and is not used.
  std::vector<CUdevResource> result(1);
  auto result_data = result.data();
  unsigned int nb_groups = 1;
  CUdevResource remaining;

  C10_CUDA_DRIVER_CHECK(
      c10::cuda::DriverAPI::get()->cuDevSmResourceSplitByCount_(
          result_data,
          &nb_groups,
          &device_resource,
          &remaining,
          0, // default flags
          num_sms));

  TORCH_CHECK(nb_groups == 1, "Failed to create single resource group");

  // Generate the resource descriptor for that single group.
  CUdevResourceDesc desc;
  C10_CUDA_DRIVER_CHECK(
      c10::cuda::DriverAPI::get()->cuDevResourceGenerateDesc_(
          &desc, result_data, 1));

  // Create the green context.
  // CU_GREEN_CTX_DEFAULT_STREAM is required per docs:
  // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__GREEN__CONTEXTS.html
  C10_CUDA_DRIVER_CHECK(c10::cuda::DriverAPI::get()->cuGreenCtxCreate_(
      &green_ctx_, desc, device, CU_GREEN_CTX_DEFAULT_STREAM));

  // Obtain the regular CUcontext view of the green context.
  C10_CUDA_DRIVER_CHECK(
      c10::cuda::DriverAPI::get()->cuCtxFromGreenCtx_(&context_, green_ctx_));
  TORCH_CHECK(context_, "Green ctx conversion to regular ctx failed!");
#else
  TORCH_CHECK(false, "Green Context is only supported on CUDA 12.8+!");
#endif
}
|
||||
|
||||
std::unique_ptr<GreenContext> GreenContext::create(
|
||||
uint32_t num_sms,
|
||||
std::optional<uint32_t> device_id) {
|
||||
#if CUDA_HAS_GREEN_CONTEXT
|
||||
if (!device_id.has_value()) {
|
||||
device_id = at::cuda::current_device();
|
||||
}
|
||||
return std::make_unique<GreenContext>(device_id.value(), num_sms);
|
||||
#else
|
||||
TORCH_CHECK(false, "Green Context is only supported on CUDA 12.8+!");
|
||||
#endif
|
||||
}
|
||||
|
||||
// Implement move operations
|
||||
// Move constructor: takes over all handles from `other` and resets `other`
// to the default (-1 / nullptr) state.
GreenContext::GreenContext(GreenContext&& other) noexcept {
#if CUDA_HAS_GREEN_CONTEXT
  // Take the handles...
  device_id_ = other.device_id_;
  green_ctx_ = other.green_ctx_;
  context_ = other.context_;
  parent_stream_ = other.parent_stream_;
  // ...and leave the source empty.
  other.device_id_ = -1;
  other.green_ctx_ = nullptr;
  other.context_ = nullptr;
  other.parent_stream_ = nullptr;
#else
  TORCH_CHECK(false, "Green Context is only supported on CUDA 12.8+!");
#endif
}
|
||||
|
||||
// Move assignment: destroys the green context currently owned by *this (if
// any), then takes over the handles of `other` and resets `other` to the
// default (-1 / nullptr) state. Self-assignment is a no-op.
// NOTE(review): this function is noexcept, so any of the checks below that
// throws will terminate the process rather than propagate.
GreenContext& GreenContext::operator=(GreenContext&& other) noexcept {
#if CUDA_HAS_GREEN_CONTEXT
  if (this != &other) {
    // Clean up current resources
    if (green_ctx_) {
      CUcontext current = nullptr;
      C10_CUDA_DRIVER_CHECK(
          c10::cuda::DriverAPI::get()->cuCtxGetCurrent_(&current));
      // Refuse to destroy a context that is current on this thread.
      TORCH_CHECK(
          current != context_,
          "attempting to overwrite current green ctx "
          "when it is active!");
      C10_CUDA_DRIVER_CHECK(
          c10::cuda::DriverAPI::get()->cuGreenCtxDestroy_(green_ctx_));
    }

    // Take ownership of other's resources
    device_id_ = std::exchange(other.device_id_, -1);
    green_ctx_ = std::exchange(other.green_ctx_, nullptr);
    context_ = std::exchange(other.context_, nullptr);
    parent_stream_ = std::exchange(other.parent_stream_, nullptr);
  }
  return *this;
#else
  TORCH_CHECK(false, "Green Context is only supported on CUDA 12.8+!");
#endif
}
|
||||
|
||||
// Destroys the owned green context, if any.
GreenContext::~GreenContext() noexcept {
#if CUDA_HAS_GREEN_CONTEXT
  // A moved-from object holds a null handle. Passing it to the driver would
  // make the check fail and throw out of this noexcept destructor, so skip
  // the destroy call when nothing is owned.
  if (green_ctx_) {
    C10_CUDA_DRIVER_CHECK(
        c10::cuda::DriverAPI::get()->cuGreenCtxDestroy_(green_ctx_));
  }
#else
  TORCH_CHECK(false, "Green Context is only supported on CUDA 12.8+!");
#endif
}
|
||||
|
||||
// Get the underlying CUDA context
|
||||
// Returns the regular CUcontext handle obtained from the green context.
CUcontext GreenContext::getContext() const {
#if CUDA_HAS_GREEN_CONTEXT
  CUcontext regular_ctx = context_;
  return regular_ctx;
#else
  TORCH_CHECK(false, "Green Context is only supported on CUDA 12.8+!");
#endif
}
|
||||
|
||||
// Get the underlying green context
|
||||
#if CUDA_HAS_GREEN_CONTEXT
// Returns the raw green-context handle. Only compiled when green contexts
// are available, because CUgreenCtx does not exist otherwise.
CUgreenCtx GreenContext::getGreenContext() const {
  CUgreenCtx handle = green_ctx_;
  return handle;
}
#endif
|
||||
|
||||
// Make this context current
|
||||
// Makes this green context current on the calling thread and switches the
// current stream to the default stream, ordered after work already queued on
// the previously current stream. The previous stream is remembered in
// parent_stream_ so popContext() can synchronize back to it.
void GreenContext::setContext() {
#if CUDA_HAS_GREEN_CONTEXT
  auto current_stream = c10::cuda::getCurrentCUDAStream();
  parent_stream_ = current_stream.stream();

  // Mark the point in the parent stream that green-context work must follow.
  at::cuda::CUDAEvent ev;
  ev.record(current_stream);

  CUcontext current = nullptr;
  C10_CUDA_DRIVER_CHECK(
      c10::cuda::DriverAPI::get()->cuCtxGetCurrent_(&current));
  if (!current) {
    // No context is bound to this thread yet: set ours directly.
    C10_CUDA_DRIVER_CHECK(
        c10::cuda::DriverAPI::get()->cuCtxSetCurrent_(context_));
  } else {
    // Otherwise push it on top of the existing one.
    C10_CUDA_DRIVER_CHECK(
        c10::cuda::DriverAPI::get()->cuCtxPushCurrent_(context_));
  }
  // currently hardcodes the new green context to use the default stream
  // TODO(eqy): consider creating a new stream if e.g., it allows interop
  // with CUDA Graph captures etc.
  auto default_stream = c10::cuda::getDefaultCUDAStream();
  ev.block(default_stream);
  c10::cuda::setCurrentCUDAStream(default_stream);
#else
  TORCH_CHECK(false, "Green Context is only supported on CUDA 12.8+!");
#endif
}
|
||||
|
||||
// Pops this green context off the calling thread's context stack and makes
// the stream saved by setContext() wait for the work recorded on the stream
// that is current at the time of the call.
void GreenContext::popContext() {
#if CUDA_HAS_GREEN_CONTEXT
  // see the note in setContext() about the stream being hardcoded to the
  // default stream
  at::cuda::CUDAEvent green_work_done;
  green_work_done.record(c10::cuda::getCurrentCUDAStream());

  CUcontext popped;
  C10_CUDA_DRIVER_CHECK(
      c10::cuda::DriverAPI::get()->cuCtxPopCurrent_(&popped));
  TORCH_INTERNAL_ASSERT(
      popped == context_, "expected popped context to be the current ctx");

  // The parent stream must not run ahead of the green-context work.
  auto parent = c10::cuda::getStreamFromExternal(parent_stream_, device_id_);
  green_work_done.block(parent);
#else
  TORCH_CHECK(false, "Green Context is only supported on CUDA 12.8+!");
#endif
}
|
||||
} // namespace at::cuda
|
||||
53
aten/src/ATen/cuda/CUDAGreenContext.h
Normal file
53
aten/src/ATen/cuda/CUDAGreenContext.h
Normal file
@ -0,0 +1,53 @@
|
||||
#pragma once
|
||||
#include <ATen/cuda/CUDAEvent.h>
|
||||
|
||||
#if defined(CUDA_VERSION) && !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED)
|
||||
#include <c10/cuda/driver_api.h>
|
||||
#include <cuda.h>
|
||||
#include <memory>
|
||||
#include <stdexcept>
|
||||
#include <vector>
|
||||
#define CUDA_HAS_GREEN_CONTEXT 1
|
||||
#else
|
||||
#define CUDA_HAS_GREEN_CONTEXT 0
|
||||
#endif
|
||||
|
||||
namespace at::cuda {
|
||||
|
||||
class TORCH_CUDA_CPP_API GreenContext {
|
||||
public:
|
||||
GreenContext(uint32_t device_id, uint32_t num_sms);
|
||||
|
||||
static std::unique_ptr<GreenContext> create(uint32_t num_sms, std::optional<uint32_t> device_id);
|
||||
|
||||
// Delete copy constructor and assignment
|
||||
GreenContext(const GreenContext&) = delete;
|
||||
GreenContext& operator=(const GreenContext&) = delete;
|
||||
|
||||
// Implement move operations
|
||||
GreenContext(GreenContext&& other) noexcept;
|
||||
GreenContext& operator=(GreenContext&& other) noexcept;
|
||||
~GreenContext() noexcept;
|
||||
|
||||
// Get the underlying CUDA context
|
||||
CUcontext getContext() const;
|
||||
|
||||
// Get the underlying green context
|
||||
#if CUDA_HAS_GREEN_CONTEXT
|
||||
CUgreenCtx getGreenContext() const;
|
||||
#endif
|
||||
|
||||
// Make this context current
|
||||
void setContext();
|
||||
|
||||
void popContext();
|
||||
|
||||
private:
|
||||
#if CUDA_HAS_GREEN_CONTEXT
|
||||
int32_t device_id_ = -1;
|
||||
CUgreenCtx green_ctx_ = nullptr;
|
||||
CUcontext context_ = nullptr;
|
||||
cudaStream_t parent_stream_ = nullptr;
|
||||
#endif
|
||||
};
|
||||
} // namespace at::cuda
|
||||
@ -70,11 +70,7 @@
|
||||
#define ATEN_CUB_MAXIMUM() NO_ROCM(at_cuda_detail)ROCM_HIPCUB(::cub)::Max()
|
||||
#endif
|
||||
|
||||
#if (!defined(USE_ROCM) && !CUB_SUPPORTS_NV_BFLOAT16()) || defined(USE_ROCM)
|
||||
|
||||
#if !defined(USE_ROCM)
|
||||
namespace at_cuda_detail {
|
||||
#endif
|
||||
#if defined(USE_ROCM)
|
||||
|
||||
// backport https://github.com/NVIDIA/cub/pull/306 for c10::BFloat16
|
||||
|
||||
@ -96,10 +92,6 @@ template <>
|
||||
struct ROCM_HIPCUB(cub)::NumericTraits<c10::BFloat16>:
|
||||
ROCM_HIPCUB(cub)::BaseTraits<ROCM_HIPCUB(cub)::FLOATING_POINT, true, false, unsigned short, c10::BFloat16> {};
|
||||
|
||||
#if !defined(USE_ROCM)
|
||||
} // namespace at_cuda_detail
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
#if !defined(USE_ROCM)
|
||||
@ -121,7 +113,7 @@ struct cuda_type<c10::Half> {
|
||||
using type = __half;
|
||||
};
|
||||
|
||||
#if !defined(USE_ROCM) && CUB_SUPPORTS_NV_BFLOAT16()
|
||||
#if !defined(USE_ROCM)
|
||||
|
||||
template<>
|
||||
struct cuda_type<c10::BFloat16> {
|
||||
@ -203,36 +195,6 @@ __global__ void transform_vals(InputIteratorT1 a, InputIteratorT2 b, OutputItera
|
||||
*out = scan_op(static_cast<acc_t>(*a), static_cast<acc_t>(*b));
|
||||
}
|
||||
|
||||
#if !CUB_SUPPORTS_FUTURE_VALUE()
|
||||
template<typename ValueT, typename InputIteratorT>
|
||||
struct chained_iterator {
|
||||
using iterator_category = std::random_access_iterator_tag;
|
||||
using difference_type = std::ptrdiff_t;
|
||||
using value_type = ValueT;
|
||||
using pointer = ValueT*;
|
||||
using reference = ValueT&;
|
||||
|
||||
InputIteratorT iter;
|
||||
ValueT *first;
|
||||
difference_type offset = 0;
|
||||
|
||||
__device__ ValueT operator[](difference_type i) {
|
||||
i += offset;
|
||||
if (i == 0) {
|
||||
return *first;
|
||||
} else {
|
||||
return ValueT(iter[i - 1]);
|
||||
}
|
||||
}
|
||||
__device__ chained_iterator operator+(difference_type i) {
|
||||
return chained_iterator{iter, first, i};
|
||||
}
|
||||
__device__ ValueT operator*() {
|
||||
return (*this)[0];
|
||||
}
|
||||
};
|
||||
#endif
|
||||
|
||||
// even though cub is supposed to support tensors with int_max elements, in reality it doesn't,
|
||||
// so split at int_max/2
|
||||
constexpr int max_cub_size = std::numeric_limits<int>::max() / 2 + 1; // 2**30
|
||||
@ -277,25 +239,6 @@ inline void inclusive_scan(InputIteratorT input, OutputIteratorT output, ScanOpT
|
||||
first_elem_ptr,
|
||||
scan_op);
|
||||
C10_CUDA_KERNEL_LAUNCH_CHECK();
|
||||
#if !CUB_SUPPORTS_FUTURE_VALUE()
|
||||
using ArgIndexInputIterator = NO_ROCM(at_cuda_detail)::cub::ArgIndexInputIterator<InputIteratorT>;
|
||||
using tuple = typename ArgIndexInputIterator::value_type;
|
||||
auto input_iter_transform = [=] __device__ (const tuple &x)->input_t {
|
||||
if (x.key == 0) {
|
||||
return *first_elem_ptr;
|
||||
} else {
|
||||
return x.value;
|
||||
}
|
||||
};
|
||||
auto input_ = ATEN_CUB_TRANSFORM_ITERATOR(input_t, decltype(input_iter_transform), ArgIndexInputIterator)(
|
||||
ArgIndexInputIterator(input + i), input_iter_transform);
|
||||
CUB_WRAPPER(NO_ROCM(at_cuda_detail)::cub::DeviceScan::InclusiveScan,
|
||||
input_,
|
||||
output + i,
|
||||
scan_op,
|
||||
size_cub,
|
||||
at::cuda::getCurrentCUDAStream());
|
||||
#else
|
||||
CUB_WRAPPER(NO_ROCM(at_cuda_detail)::cub::DeviceScan::ExclusiveScan,
|
||||
input + i + 1,
|
||||
output + i,
|
||||
@ -303,7 +246,6 @@ inline void inclusive_scan(InputIteratorT input, OutputIteratorT output, ScanOpT
|
||||
::at_cuda_detail::cub::FutureValue<input_t>(first_elem_ptr),
|
||||
size_cub,
|
||||
at::cuda::getCurrentCUDAStream());
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
}
|
||||
@ -555,16 +497,6 @@ inline void exclusive_scan(InputIteratorT input, OutputIteratorT output, ScanOpT
|
||||
first_elem_ptr,
|
||||
scan_op);
|
||||
C10_CUDA_KERNEL_LAUNCH_CHECK();
|
||||
#if !CUB_SUPPORTS_FUTURE_VALUE()
|
||||
auto input_ = impl::chained_iterator<InitValueT, InputIteratorT>{
|
||||
input + i, first_elem_ptr};
|
||||
CUB_WRAPPER(NO_ROCM(at_cuda_detail)::cub::DeviceScan::InclusiveScan,
|
||||
input_,
|
||||
output + i,
|
||||
scan_op,
|
||||
size_cub,
|
||||
at::cuda::getCurrentCUDAStream());
|
||||
#else
|
||||
CUB_WRAPPER(NO_ROCM(at_cuda_detail)::cub::DeviceScan::ExclusiveScan,
|
||||
input + i,
|
||||
output + i,
|
||||
@ -572,7 +504,6 @@ inline void exclusive_scan(InputIteratorT input, OutputIteratorT output, ScanOpT
|
||||
::at_cuda_detail::cub::FutureValue<InitValueT>(first_elem_ptr),
|
||||
size_cub,
|
||||
at::cuda::getCurrentCUDAStream());
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
@ -10,14 +10,6 @@
|
||||
#define CUB_VERSION 200001
|
||||
#endif
|
||||
|
||||
// cub sort support for __nv_bfloat16 is added to cub 1.13 in:
|
||||
// https://github.com/NVIDIA/cub/pull/306
|
||||
#if CUB_VERSION >= 101300
|
||||
#define CUB_SUPPORTS_NV_BFLOAT16() true
|
||||
#else
|
||||
#define CUB_SUPPORTS_NV_BFLOAT16() false
|
||||
#endif
|
||||
|
||||
// cub support for CUB_WRAPPED_NAMESPACE is added to cub 1.13.1 in:
|
||||
// https://github.com/NVIDIA/cub/pull/326
|
||||
// CUB_WRAPPED_NAMESPACE is defined globally in cmake/Dependencies.cmake
|
||||
@ -28,14 +20,6 @@
|
||||
#define USE_GLOBAL_CUB_WRAPPED_NAMESPACE() false
|
||||
#endif
|
||||
|
||||
// cub support for cub::FutureValue is added to cub 1.15 in:
|
||||
// https://github.com/NVIDIA/cub/pull/305
|
||||
#if CUB_VERSION >= 101500
|
||||
#define CUB_SUPPORTS_FUTURE_VALUE() true
|
||||
#else
|
||||
#define CUB_SUPPORTS_FUTURE_VALUE() false
|
||||
#endif
|
||||
|
||||
// There were many bc-breaking changes in major version release of CCCL v3.0.0
|
||||
// Please see https://nvidia.github.io/cccl/cccl/3.0_migration_guide.html
|
||||
#if CUB_VERSION >= 200800
|
||||
|
||||
23
aten/src/ATen/detail/XLAHooksInterface.cpp
Normal file
23
aten/src/ATen/detail/XLAHooksInterface.cpp
Normal file
@ -0,0 +1,23 @@
|
||||
#include <ATen/detail/XLAHooksInterface.h>
|
||||
|
||||
namespace at {
|
||||
namespace detail {
|
||||
|
||||
// Returns the process-wide XLA hooks object. It is built on first use: the
// registry is asked for "torch_xla::detail::XLAHooks", and when that yields
// nothing the default XLAHooksInterface is used instead.
const XLAHooksInterface& getXLAHooks() {
  static auto hooks = [] {
    // Create XLA hooks using the registry
    auto created = XLAHooksRegistry()->Create("torch_xla::detail::XLAHooks", XLAHooksArgs{});
    if (!created) {
      // If hooks creation fails, fall back to default implementation
      created = std::make_unique<XLAHooksInterface>();
    }
    return created;
  }();
  return *hooks;
}
|
||||
} // namespace detail
|
||||
|
||||
C10_DEFINE_REGISTRY(XLAHooksRegistry, XLAHooksInterface, XLAHooksArgs)
|
||||
|
||||
} // namespace at
|
||||
79
aten/src/ATen/detail/XLAHooksInterface.h
Normal file
79
aten/src/ATen/detail/XLAHooksInterface.h
Normal file
@ -0,0 +1,79 @@
|
||||
#pragma once
|
||||
|
||||
#include <c10/core/Device.h>
|
||||
#include <c10/util/Exception.h>
|
||||
#include <c10/util/Registry.h>
|
||||
|
||||
#include <ATen/detail/AcceleratorHooksInterface.h>
|
||||
|
||||
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wunused-parameter")
|
||||
|
||||
namespace at {
|
||||
|
||||
constexpr const char* XLA_HELP =
|
||||
"This error has occurred because you are trying "
|
||||
"to use some XLA functionality, but the XLA library has not been "
|
||||
"loaded by the dynamic linker. You must load xla libraries by `import torch_xla`";
|
||||
|
||||
struct TORCH_API XLAHooksInterface : AcceleratorHooksInterface {
|
||||
~XLAHooksInterface() override = default;
|
||||
|
||||
void init() const override {
|
||||
TORCH_CHECK(false, "Cannot initialize XLA without torch_xla library. ", XLA_HELP);
|
||||
}
|
||||
|
||||
virtual bool hasXLA() const {
|
||||
return false;
|
||||
}
|
||||
|
||||
virtual std::string showConfig() const {
|
||||
TORCH_CHECK(
|
||||
false,
|
||||
"Cannot query detailed XLA version without torch_xla library. ",
|
||||
XLA_HELP);
|
||||
}
|
||||
|
||||
const Generator& getDefaultGenerator(
|
||||
[[maybe_unused]] DeviceIndex device_index = -1) const override {
|
||||
TORCH_CHECK(
|
||||
false, "Cannot get default XLA generator without torch_xla library. ", XLA_HELP);
|
||||
}
|
||||
|
||||
Generator getNewGenerator(
|
||||
[[maybe_unused]] DeviceIndex device_index = -1) const override {
|
||||
TORCH_CHECK(false, "Cannot get XLA generator without torch_xla library. ", XLA_HELP);
|
||||
}
|
||||
|
||||
virtual DeviceIndex getCurrentDevice() const override {
|
||||
TORCH_CHECK(false, "Cannot get current XLA device without torch_xla library. ", XLA_HELP);
|
||||
}
|
||||
|
||||
Device getDeviceFromPtr(void* /*data*/) const override {
|
||||
TORCH_CHECK(false, "Cannot get device of pointer on XLA without torch_xla library. ", XLA_HELP);
|
||||
}
|
||||
|
||||
Allocator* getPinnedMemoryAllocator() const override {
|
||||
TORCH_CHECK(false, "Cannot get XLA pinned memory allocator without torch_xla library. ", XLA_HELP);
|
||||
}
|
||||
|
||||
bool isPinnedPtr(const void* data) const override {
|
||||
return false;
|
||||
}
|
||||
|
||||
bool hasPrimaryContext(DeviceIndex device_index) const override {
|
||||
TORCH_CHECK(false, "Cannot query primary context without torch_xla library. ", XLA_HELP);
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
struct TORCH_API XLAHooksArgs {};
|
||||
|
||||
TORCH_DECLARE_REGISTRY(XLAHooksRegistry, XLAHooksInterface, XLAHooksArgs);
|
||||
#define REGISTER_XLA_HOOKS(clsname) \
|
||||
C10_REGISTER_CLASS(XLAHooksRegistry, clsname, clsname)
|
||||
|
||||
namespace detail {
|
||||
TORCH_API const XLAHooksInterface& getXLAHooks();
|
||||
} // namespace detail
|
||||
} // namespace at
|
||||
C10_DIAGNOSTIC_POP()
|
||||
@ -3620,7 +3620,7 @@ Tensor& _int_mm_out_cpu(const Tensor& self, const Tensor& mat2, Tensor& result)
|
||||
try {
|
||||
mkldnn_matmul_i8i8i32(self, mat2, result);
|
||||
dispatched = true;
|
||||
} catch (const std::exception& e) {
|
||||
} catch ([[maybe_unused]] const std::exception& e) {
|
||||
TORCH_WARN(func_name, " failed, switching to BLAS gemm: ", e.what());
|
||||
}
|
||||
}
|
||||
|
||||
@ -11,6 +11,8 @@ inline void check_pixel_shuffle_shapes(const Tensor& self, int64_t upscale_facto
|
||||
"pixel_shuffle expects a positive upscale_factor, but got ",
|
||||
upscale_factor);
|
||||
int64_t c = self.size(-3);
|
||||
TORCH_CHECK_VALUE(upscale_factor <= std::numeric_limits<decltype(upscale_factor)>::max() / upscale_factor,
|
||||
"upscale factor is too large, (upscale_factor)^2 overflowed: upscale_factor=", upscale_factor);
|
||||
int64_t upscale_factor_squared = upscale_factor * upscale_factor;
|
||||
TORCH_CHECK(c % upscale_factor_squared == 0,
|
||||
"pixel_shuffle expects its input's 'channel' dimension to be divisible by the square of "
|
||||
|
||||
@ -259,11 +259,20 @@ inline void winograd_f2k3_input_transform_inplace__rvv(
|
||||
const vfloat32m1_t wd1 = __riscv_vfadd_vv_f32m1(d1, d2, 4);
|
||||
const vfloat32m1_t wd2 = __riscv_vfsub_vv_f32m1(d2, d1, 4);
|
||||
const vfloat32m1_t wd3 = __riscv_vfsub_vv_f32m1(d1, d3, 4);
|
||||
|
||||
*input_tile_val = __riscv_vset_v_f32m1_f32m1x4(*input_tile_val, 0, wd0);
|
||||
*input_tile_val = __riscv_vset_v_f32m1_f32m1x4(*input_tile_val, 1, wd1);
|
||||
*input_tile_val = __riscv_vset_v_f32m1_f32m1x4(*input_tile_val, 2, wd2);
|
||||
*input_tile_val = __riscv_vset_v_f32m1_f32m1x4(*input_tile_val, 3, wd3);
|
||||
/* GCC 14.2 (RISC-V RVV) ICE workaround:
|
||||
* Avoid single-statement read-modify-write on MEM_REF like:
|
||||
* *input_tile_val =
|
||||
* __riscv_vset_v_f32m1_f32m1x4(*input_tile_val, idx, val);
|
||||
* This triggers an ICE during GIMPLE lower (gsi_replace / riscv_gimple_fold_builtin)
|
||||
* with -march=rv64gcv. Use a temporary then write back.
|
||||
* Do NOT refactor into the single-statement form. Clang is unaffected.
|
||||
*/
|
||||
vfloat32m1x4_t tmp_input_tile_val = *input_tile_val;
|
||||
tmp_input_tile_val = __riscv_vset_v_f32m1_f32m1x4(tmp_input_tile_val, 0, wd0);
|
||||
tmp_input_tile_val = __riscv_vset_v_f32m1_f32m1x4(tmp_input_tile_val, 1, wd1);
|
||||
tmp_input_tile_val = __riscv_vset_v_f32m1_f32m1x4(tmp_input_tile_val, 2, wd2);
|
||||
tmp_input_tile_val = __riscv_vset_v_f32m1_f32m1x4(tmp_input_tile_val, 3, wd3);
|
||||
*input_tile_val = tmp_input_tile_val;
|
||||
}
|
||||
|
||||
inline void winograd_f2k3_output_transform_inplace__rvv(
|
||||
@ -277,9 +286,15 @@ inline void winograd_f2k3_output_transform_inplace__rvv(
|
||||
const vfloat32m1_t wm0 = __riscv_vfadd_vv_f32m1(m0_plus_m1, m2, 4);
|
||||
const vfloat32m1_t m1_sub_m2 = __riscv_vfsub_vv_f32m1(m1, m2, 4);
|
||||
const vfloat32m1_t wm1 = __riscv_vfsub_vv_f32m1(m1_sub_m2, m3, 4);
|
||||
|
||||
*input_tile_val = __riscv_vset_v_f32m1_f32m1x4(*input_tile_val, 0, wm0);
|
||||
*input_tile_val = __riscv_vset_v_f32m1_f32m1x4(*input_tile_val, 1, wm1);
|
||||
/* GCC 14.2 (RISC-V RVV) ICE workaround — see note above.
|
||||
* Keep the temporary + write-back pattern to avoid ICE.
|
||||
* Do NOT rewrite into:
|
||||
* *input_tile_val = __riscv_vset_v_f32m1_f32m1x4(*input_tile_val, idx, val);
|
||||
*/
|
||||
vfloat32m1x4_t tmp_output_tile_val = *input_tile_val;
|
||||
tmp_output_tile_val = __riscv_vset_v_f32m1_f32m1x4(tmp_output_tile_val, 0, wm0);
|
||||
tmp_output_tile_val = __riscv_vset_v_f32m1_f32m1x4(tmp_output_tile_val, 1, wm1);
|
||||
*input_tile_val = tmp_output_tile_val;
|
||||
}
|
||||
|
||||
inline vfloat32m1_t
|
||||
@ -300,11 +315,17 @@ inline void winograd_f2k3_kernel_transform__rvv(
|
||||
const vfloat32m1_t const_half = __riscv_vfmv_v_f_f32m1(0.5f, 4);
|
||||
const vfloat32m1_t g0_plus_g2 = __riscv_vfadd_vv_f32m1(g0, g2, 4);
|
||||
vfloat32m1_t half_g0_plus_g2 = __riscv_vfmul_vv_f32m1(const_half, g0_plus_g2, 4);
|
||||
|
||||
*transform = __riscv_vset_v_f32m1_f32m1x4(*transform, 0, g0);
|
||||
*transform = __riscv_vset_v_f32m1_f32m1x4(*transform, 1, vmuladdq_f32(half_g0_plus_g2, const_half, g1));
|
||||
*transform = __riscv_vset_v_f32m1_f32m1x4(*transform, 2, vmulsubq_f32(half_g0_plus_g2, const_half, g1));
|
||||
*transform = __riscv_vset_v_f32m1_f32m1x4(*transform, 3, g2);
|
||||
/* GCC 14.2 (RISC-V RVV) ICE workaround — see note above.
|
||||
* Keep the temporary + write-back pattern to avoid ICE.
|
||||
* Do NOT rewrite into:
|
||||
* *transform = __riscv_vset_v_f32m1_f32m1x4(*transform, idx, val);
|
||||
*/
|
||||
vfloat32m1x4_t tmp_transform = *transform;
|
||||
tmp_transform = __riscv_vset_v_f32m1_f32m1x4(tmp_transform, 0, g0);
|
||||
tmp_transform = __riscv_vset_v_f32m1_f32m1x4(tmp_transform, 1, vmuladdq_f32(half_g0_plus_g2, const_half, g1));
|
||||
tmp_transform = __riscv_vset_v_f32m1_f32m1x4(tmp_transform, 2, vmulsubq_f32(half_g0_plus_g2, const_half, g1));
|
||||
tmp_transform = __riscv_vset_v_f32m1_f32m1x4(tmp_transform, 3, g2);
|
||||
*transform = tmp_transform;
|
||||
}
|
||||
|
||||
inline vfloat32m1x4_t v4f_transpose4x4__rvv(const vfloat32m1x4_t m) {
|
||||
|
||||
@ -272,28 +272,110 @@ cuda::blas::GEMMAndBiasActivationEpilogue activation_to_gemm_and_blas_arg(Activa
|
||||
}
|
||||
}
|
||||
|
||||
static bool getDisableAddmmCudaLt() {
|
||||
static const auto env_value = c10::utils::get_env("DISABLE_ADDMM_CUDA_LT");
|
||||
if (env_value == "1") {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
/*
|
||||
* Checks whether DISABLE_ADDMM_CUDA_LT is set.
|
||||
* Additionally, for ROCM we test whether the architecture supports the Lt.
|
||||
*/
|
||||
static bool isGloballyDisabledAddmmCudaLt(const at::Device& device) {
|
||||
// When hipBLASLt is not supported on the architecture, return true
|
||||
#ifdef USE_ROCM
|
||||
static const std::vector<std::string> archs = {
|
||||
"gfx90a", "gfx942",
|
||||
#if ROCM_VERSION >= 60300
|
||||
"gfx1100", "gfx1101", "gfx1200", "gfx1201", "gfx908",
|
||||
#endif
|
||||
#if ROCM_VERSION >= 70000
|
||||
"gfx950", "gfx1150", "gfx1151"
|
||||
#endif
|
||||
};
|
||||
const auto is_hipblas_lt_arch_supported = at::detail::getCUDAHooks().isGPUArch(archs, device.index());
|
||||
if (!is_hipblas_lt_arch_supported) {
|
||||
return true;
|
||||
}
|
||||
#endif
|
||||
|
||||
// Check whether it is disabled in the env
|
||||
static const auto is_addmm_cuda_lt_disabled = c10::utils::get_env("DISABLE_ADDMM_CUDA_LT");
|
||||
if (is_addmm_cuda_lt_disabled == "1") {
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
#ifdef USE_ROCM
|
||||
static bool isSupportedHipLtROCmArch(int index) {
|
||||
static const std::vector<std::string> archs = {
|
||||
"gfx90a", "gfx942",
|
||||
#if ROCM_VERSION >= 60300
|
||||
"gfx1100", "gfx1101", "gfx1200", "gfx1201", "gfx908",
|
||||
#endif
|
||||
#if ROCM_VERSION >= 70000
|
||||
"gfx950", "gfx1150", "gfx1151"
|
||||
#endif
|
||||
};
|
||||
return at::detail::getCUDAHooks().isGPUArch(archs, index);
|
||||
/*
|
||||
* Check whether for the given input we want to enable the Lt interface
|
||||
*/
|
||||
static bool isInputCompliesAddmmCudaLt(Tensor& result, const Tensor& self, const Tensor& mat1, const Tensor& mat2, const Scalar& beta, const Scalar& alpha) {
|
||||
// Implies 2D bias which we currently not send through Lt.
|
||||
// TODO: this check is done pre col-major input preparation,
|
||||
// so, this condition can be ralexed in cases when a col-major
|
||||
// copy of result is needed.
|
||||
if (result.is_same(self)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
#if defined(USE_ROCM) && ROCM_VERSION == 60400
|
||||
// hipblaslt TT fp32 regression on ROCm 6.4, cannot use
|
||||
const auto args = cublasCommonArgs(mat1, mat2, result);
|
||||
if (args.transa == 't' && args.transb == 't') {
|
||||
return false;
|
||||
}
|
||||
#endif
|
||||
|
||||
const auto mat1_sizes = mat1.sizes();
|
||||
const auto mat2_sizes = mat2.sizes();
|
||||
#if defined(CUDA_VERSION) || defined(USE_ROCM)
|
||||
const auto scalar_type = mat1.scalar_type();
|
||||
return (beta.toComplexDouble() == 1.0
|
||||
// self.dim() == 1 && result.dim() == 2 && self.sizes()[0] == mat2_sizes[1]
|
||||
// is to use lt interface only when self is bias.
|
||||
&& self.dim() == 1 && self.sizes()[0] == mat2_sizes[1] && self.is_contiguous()
|
||||
&& result.dim() == 2 && result.is_contiguous()
|
||||
&& ( // some dtype restrictions
|
||||
#ifndef USE_ROCM
|
||||
scalar_type == at::ScalarType::Double ||
|
||||
#endif
|
||||
scalar_type == at::ScalarType::Float ||
|
||||
scalar_type == at::ScalarType::Half ||
|
||||
scalar_type == at::ScalarType::BFloat16
|
||||
)
|
||||
&& ( // some shape/stride restrictions
|
||||
// Strangely, if mat2 has only 1 row or column, we get
|
||||
// CUBLAS_STATUS_INVALID_VALUE error from cublasLtMatmulAlgoGetHeuristic.
|
||||
// NOTE: extension to mat1 because mat1/mat2 can be swapped based off
|
||||
// their row-/col-majorness.
|
||||
mat1_sizes[0] > 1 && mat1_sizes[1] > 1 &&
|
||||
mat2_sizes[0] > 1 && mat2_sizes[1] > 1
|
||||
// The last conditions is to skip 16b transA and non-trans-B having
|
||||
// leading dim >> rows when they are sliced from a large tensor
|
||||
// see fbcode/caffe2/test/test_linalg.py:test_corner_cases_of_cublasltmatmul
|
||||
#if !(defined(CUDA_VERSION) && CUDA_VERSION >= 12010 || defined(USE_ROCM))
|
||||
// Related to avoiding the leading stride >> leading dim problematic case
|
||||
// with 16b dtypes described above. For such dtypes we only allow inputs
|
||||
// which are either row- or col-major (i.e. non-overlapping, compact memory layout).
|
||||
// In that case the leading stride will be equal to the outer dim len.
|
||||
// Why do we catch this case here? The following `prepare_matrix_for_cublas` method
|
||||
// does not modify inputs as long as there is a stride of length 1
|
||||
// and the leading stride is at least max(1, other dim length), so we might
|
||||
// end up with contiguous cols but not rows (i.e. holes between different rows)
|
||||
// and vice versa.
|
||||
mat2_sizes[0] < 65535 * 32 && mat2_sizes[1] < 65535 * 32 &&
|
||||
mat1_sizes[0] < 65535 * 32 && mat1_sizes[1] < 65535 * 32 &&
|
||||
&& (
|
||||
// filter by dtype
|
||||
(scalar_type != at::ScalarType::Half && scalar_type != at::ScalarType::BFloat16) ||
|
||||
// check mat1/mat2 is row-/col-major
|
||||
(mat1.is_non_overlapping_and_dense() && mat2.is_non_overlapping_and_dense())
|
||||
)
|
||||
#endif
|
||||
)
|
||||
);
|
||||
#endif
|
||||
|
||||
// no compliance by default
|
||||
return false;
|
||||
}
|
||||
#endif
|
||||
|
||||
template <typename scalar_t>
|
||||
void launchTunableGemmAndBias(cublasCommonArgs &args, const Scalar& alpha, const scalar_t* bias, cuda::blas::GEMMAndBiasActivationEpilogue activation) {
|
||||
@ -335,7 +417,70 @@ void launchTunableGemmAndBias(cublasCommonArgs &args, const Scalar& alpha, const
|
||||
}
|
||||
}
|
||||
|
||||
template <typename scalar_t, typename res_scalar_t = scalar_t>
|
||||
bool launchGemmAndBiasCublasLt(
|
||||
// args contains result which is modified
|
||||
cublasCommonArgs& args,
|
||||
const Tensor& self,
|
||||
const Scalar& alpha,
|
||||
Activation activation = Activation::None
|
||||
) {
|
||||
const auto* self_ptr = self.const_data_ptr<scalar_t>();
|
||||
|
||||
const auto tuning_ctx = at::cuda::tunable::getTuningContext();
|
||||
if (tuning_ctx->IsTunableOpEnabled()) {
|
||||
// TODO: maybe also return some success state?
|
||||
launchTunableGemmAndBias<scalar_t>(
|
||||
args, alpha, self_ptr, activation_to_gemm_and_blas_arg(activation)
|
||||
);
|
||||
return true;
|
||||
}
|
||||
|
||||
return at::cuda::blas::gemm_and_bias<scalar_t, res_scalar_t>(
|
||||
args.transa == 't',
|
||||
args.transb == 't',
|
||||
args.m,
|
||||
args.n,
|
||||
args.k,
|
||||
alpha.to<at::opmath_type<scalar_t>>(),
|
||||
args.mata->const_data_ptr<scalar_t>(),
|
||||
args.lda,
|
||||
args.matb->const_data_ptr<scalar_t>(),
|
||||
args.ldb,
|
||||
self_ptr,
|
||||
args.result->data_ptr<res_scalar_t>(),
|
||||
args.result_ld,
|
||||
activation_to_gemm_and_blas_arg(activation)
|
||||
);
|
||||
}
|
||||
|
||||
template <typename scalar_t, typename res_scalar_t = scalar_t>
|
||||
bool launchGemmCublas(
|
||||
// args contains result which is modified
|
||||
cublasCommonArgs& args,
|
||||
const Scalar& alpha,
|
||||
const Scalar& beta
|
||||
) {
|
||||
at::cuda::blas::gemm<scalar_t, res_scalar_t>(
|
||||
args.transa,
|
||||
args.transb,
|
||||
args.m,
|
||||
args.n,
|
||||
args.k,
|
||||
alpha.to<at::opmath_type<scalar_t>>(),
|
||||
args.mata->const_data_ptr<scalar_t>(),
|
||||
args.lda,
|
||||
args.matb->const_data_ptr<scalar_t>(),
|
||||
args.ldb,
|
||||
beta.to<at::opmath_type<scalar_t>>(),
|
||||
args.result->data_ptr<res_scalar_t>(),
|
||||
args.result_ld
|
||||
);
|
||||
return true; // success!
|
||||
}
|
||||
|
||||
Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& mat1, const Tensor& mat2, const Scalar& beta, const Scalar& alpha, Activation activation=Activation::None, bool disable_addmm_cuda_lt_override=false) {
|
||||
// Shape checks {
|
||||
// Make sure to keep addmm_cuda below in sync with this code; it
|
||||
// preflights a check to try to avoid actually needing to call
|
||||
// expand().
|
||||
@ -345,105 +490,62 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
|
||||
"expected mat1 and mat2 to have the same dtype, but got: ", mat1.dtype(), " != ", mat2.dtype()
|
||||
)
|
||||
|
||||
if (result.is_same(self)) {
|
||||
TORCH_CHECK(result.dim() == 2, "tensors must be 2-D");
|
||||
TORCH_CHECK(self.sizes()[0] == mat1.sizes()[0], "self dim 0 must match mat1 dim 0");
|
||||
TORCH_CHECK(self.sizes()[1] == mat2.sizes()[1], "self dim 1 must match mat2 dim 1");
|
||||
}
|
||||
// } Shape checks
|
||||
|
||||
// NOLINTNEXTLINE(*c-array*)
|
||||
TensorArg targs[]{{result, "out", 0}, {self, "self", 1}, {mat1, "mat1", 2}, {mat2, "mat2", 3}};
|
||||
checkAllSameGPU(__func__, targs);
|
||||
|
||||
IntArrayRef mat1_sizes = mat1.sizes();
|
||||
IntArrayRef mat2_sizes = mat2.sizes();
|
||||
IntArrayRef self__sizes;
|
||||
bool useLtInterface = false;
|
||||
#if defined(USE_ROCM)
|
||||
// When hipBLASLt is not supported on the architecture,
|
||||
// disable_addmm_cuda_lt will always be set to true
|
||||
static bool disable_addmm_cuda_lt =
|
||||
!isSupportedHipLtROCmArch(self.device().index()) || getDisableAddmmCudaLt();
|
||||
#else
|
||||
static bool disable_addmm_cuda_lt = getDisableAddmmCudaLt();
|
||||
#endif
|
||||
// Handle whether to use the Lt interface {
|
||||
static bool persistent_disable_addmm_cuda_lt = isGloballyDisabledAddmmCudaLt(self.device());
|
||||
// if lt path fails, we recurse back into this function here and force the lt path to off
|
||||
// we cannot update variable disable_addmm_cuda_lt from above since it is static and would be permanent
|
||||
bool disable_addmm_cuda_lt_final = disable_addmm_cuda_lt || disable_addmm_cuda_lt_override;
|
||||
#if defined(USE_ROCM) && ROCM_VERSION == 60400
|
||||
// hipblaslt TT fp32 regression on ROCm 6.4, cannot use
|
||||
cublasCommonArgs _args(mat1, mat2, result);
|
||||
if (_args.transa == 't' && _args.transb == 't') {
|
||||
disable_addmm_cuda_lt_final = true;
|
||||
}
|
||||
#endif
|
||||
bool disable_addmm_cuda_lt = persistent_disable_addmm_cuda_lt || disable_addmm_cuda_lt_override;
|
||||
#ifdef USE_ROCM
|
||||
// Conditioned on the device index, which is not persistent
|
||||
disable_addmm_cuda_lt = isGloballyDisabledAddmmCudaLt(self.device()) || disable_addmm_cuda_lt;
|
||||
#endif
|
||||
// Condition on the input
|
||||
disable_addmm_cuda_lt = !isInputCompliesAddmmCudaLt(result, self, mat1, mat2, beta, alpha) || disable_addmm_cuda_lt;
|
||||
// }
|
||||
|
||||
at::ScalarType scalar_type = mat1.scalar_type();
|
||||
bool is_float_output_with_half_input = (scalar_type == at::ScalarType::Half || scalar_type == at::ScalarType::BFloat16) && result.scalar_type() == at::ScalarType::Float;
|
||||
c10::MaybeOwned<Tensor> self_;
|
||||
if (&result != &self) {
|
||||
#if defined(CUDA_VERSION) || defined(USE_ROCM)
|
||||
// Strangely, if mat2 has only 1 row or column, we get
|
||||
// CUBLAS_STATUS_INVALID_VALUE error from cublasLtMatmulAlgoGetHeuristic.
|
||||
// self.dim() == 1 && result.dim() == 2 && self.sizes()[0] == mat2_sizes[1]
|
||||
// is to use lt interface only when self is bias.
|
||||
// for cuda 11.4, cublasLtMatmul is activated
|
||||
// the last two conditions is to skip 16b transA and non-trans-B having
|
||||
// leading dim >> rows when they are sliced from a large tensor
|
||||
// see fbcode/caffe2/test/test_linalg.py:test_corner_cases_of_cublasltmatmul
|
||||
if (!disable_addmm_cuda_lt_final) {
|
||||
useLtInterface = beta.toComplexDouble() == 1.0 && self.dim() == 1 &&
|
||||
result.dim() == 2 && self.sizes()[0] == mat2_sizes[1] &&
|
||||
self.is_contiguous() && result.is_contiguous() &&
|
||||
#ifdef USE_ROCM
|
||||
(scalar_type == at::ScalarType::Float ||
|
||||
scalar_type == at::ScalarType::Half ||
|
||||
scalar_type == at::ScalarType::BFloat16) &&
|
||||
#else
|
||||
(scalar_type == at::ScalarType::Double ||
|
||||
scalar_type == at::ScalarType::Float ||
|
||||
scalar_type == at::ScalarType::Half ||
|
||||
scalar_type == at::ScalarType::BFloat16) &&
|
||||
#endif
|
||||
#if (defined(CUDA_VERSION) && CUDA_VERSION >= 12010 || defined(USE_ROCM))
|
||||
mat2_sizes[0] > 1 && mat2_sizes[1] > 1;
|
||||
#else
|
||||
mat2_sizes[0] > 1 && mat2_sizes[1] > 1 &&
|
||||
mat2_sizes[0] < 65535 * 32 && mat2_sizes[1] < 65535 * 32 &&
|
||||
mat1_sizes[0] < 65535 * 32 && mat1_sizes[1] < 65535 * 32 &&
|
||||
// avoid leading dim >> rows bugs
|
||||
((mat1.strides()[0] == 1 && mat1.strides()[1] == mat1_sizes[0]) ||
|
||||
(mat1.strides()[1] == 1 && mat1.strides()[0] == mat1_sizes[1]) ||
|
||||
(scalar_type != at::ScalarType::Half &&
|
||||
scalar_type != at::ScalarType::BFloat16)) &&
|
||||
((mat2.strides()[0] == 1 && mat2.strides()[1] == mat2_sizes[0]) ||
|
||||
(mat2.strides()[1] == 1 && mat2.strides()[0] == mat2_sizes[1]) ||
|
||||
(scalar_type != at::ScalarType::Half &&
|
||||
scalar_type != at::ScalarType::BFloat16));
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
if (!useLtInterface) {
|
||||
self_ = expand_size(self, {mat1_sizes[0], mat2_sizes[1]}, "addmm");
|
||||
}
|
||||
self__sizes = self_->sizes();
|
||||
} else {
|
||||
self_ = c10::MaybeOwned<Tensor>::borrowed(self);
|
||||
self__sizes = self_->sizes();
|
||||
TORCH_CHECK(result.dim() == 2, "tensors must be 2-D");
|
||||
TORCH_CHECK(self__sizes[0] == mat1_sizes[0], "self_ dim 0 must match mat1 dim 0");
|
||||
TORCH_CHECK(self__sizes[1] == mat2_sizes[1], "self_ dim 1 must match mat2 dim 1");
|
||||
}
|
||||
|
||||
if (&result != &self) {
|
||||
at::native::resize_output(result, {mat1_sizes[0], mat2_sizes[1]});
|
||||
if (beta.toComplexDouble() != 0.0 && !useLtInterface) {
|
||||
at::native::copy_(result, *self_);
|
||||
// Handle result/self shapes
|
||||
if (!result.is_same(self)) {
|
||||
at::native::resize_output(result, {mat1.sizes()[0], mat2.sizes()[1]});
|
||||
|
||||
const auto self_maybe_expanded = [&]() -> c10::MaybeOwned<Tensor> {
|
||||
if (disable_addmm_cuda_lt) {
|
||||
// When in non-Lt path we do expand self even before
|
||||
// check for beta != 0.0 to make sure that
|
||||
// test_sparse_csr.py::TestSparseCSRCUDA::test_addmm_errors_*
|
||||
// runs green.
|
||||
return expand_size(self, result.sizes(), "addmm");
|
||||
}
|
||||
// copy next, should broadcast
|
||||
return c10::MaybeOwned<Tensor>::borrowed(self);
|
||||
}();
|
||||
// We copy bias when in the non-Lt path
|
||||
if (beta.toComplexDouble() != 0.0 && disable_addmm_cuda_lt) {
|
||||
// NOTE: self should broadcast over result
|
||||
at::native::copy_(result, *self_maybe_expanded);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
IntArrayRef result_sizes = result.sizes();
|
||||
if ((result_sizes[0] == 0) || (result_sizes[1] == 0)) {
|
||||
// Short circuit on empty result
|
||||
if (result.numel() == 0) {
|
||||
return result;
|
||||
}
|
||||
|
||||
cublasCommonArgs args(mat1, mat2, result);
|
||||
|
||||
if (mat1.numel() == 0) {
|
||||
// Short circuit if the reduction dim is empty
|
||||
if (mat1.sizes()[1] == 0) {
|
||||
// By definition, when beta==0, values in self should be ignored. nans and infs
|
||||
// should not propagate
|
||||
if (beta.toComplexDouble() == 0.) {
|
||||
@ -455,158 +557,64 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
|
||||
result,
|
||||
self.expand(result.sizes()),
|
||||
at::native::scalar_tensor(
|
||||
beta,
|
||||
self.scalar_type(),
|
||||
std::nullopt /* layout */,
|
||||
at::kCPU,
|
||||
std::nullopt /* pin_memory */));
|
||||
beta,
|
||||
self.scalar_type(),
|
||||
std::nullopt /* layout */,
|
||||
at::kCPU,
|
||||
std::nullopt /* pin_memory */
|
||||
)
|
||||
);
|
||||
}
|
||||
|
||||
cublasCommonArgs args(mat1, mat2, result);
|
||||
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!args.result->is_conj());
|
||||
|
||||
if (useLtInterface) {
|
||||
#if defined(USE_ROCM)
|
||||
bool okay = true;
|
||||
// The Lt path
|
||||
if (!disable_addmm_cuda_lt) {
|
||||
bool lt_success = false;
|
||||
if (is_float_output_with_half_input) {
|
||||
#ifdef USE_ROCM
|
||||
TORCH_CHECK(false, "float output with half input is not enabled for ROCm");
|
||||
} else {
|
||||
AT_DISPATCH_FLOATING_TYPES_AND2(
|
||||
at::ScalarType::Half,
|
||||
at::ScalarType::BFloat16,
|
||||
scalar_type,
|
||||
"addmm_cuda_lt",
|
||||
[&] {
|
||||
auto tuning_ctx = at::cuda::tunable::getTuningContext();
|
||||
if (tuning_ctx->IsTunableOpEnabled()) {
|
||||
launchTunableGemmAndBias<scalar_t>(
|
||||
args,
|
||||
alpha,
|
||||
(&result != &self) ? self.const_data_ptr<scalar_t>() : nullptr,
|
||||
activation_to_gemm_and_blas_arg(activation));
|
||||
} else {
|
||||
okay = at::cuda::blas::gemm_and_bias<scalar_t>(
|
||||
args.transa == 't',
|
||||
args.transb == 't',
|
||||
args.m,
|
||||
args.n,
|
||||
args.k,
|
||||
alpha.to<at::opmath_type<scalar_t>>(),
|
||||
args.mata->const_data_ptr<scalar_t>(),
|
||||
args.lda,
|
||||
args.matb->const_data_ptr<scalar_t>(),
|
||||
args.ldb,
|
||||
// This condition is needed for mm case on ROCm for hipblasLt path.
|
||||
// Passing the bias ptr as null to avoid accuracy issues for mm case.
|
||||
(&result != &self) ? self.const_data_ptr<scalar_t>() : nullptr,
|
||||
args.result->data_ptr<scalar_t>(),
|
||||
args.result_ld,
|
||||
activation_to_gemm_and_blas_arg(activation)
|
||||
);
|
||||
}
|
||||
});
|
||||
}
|
||||
if (!okay) {
|
||||
// lt path failed; recurse but disable lt path
|
||||
return addmm_out_cuda_impl(result, self, mat1, mat2, beta, alpha, activation, true);
|
||||
}
|
||||
#else
|
||||
auto activation_epilogue = activation_to_gemm_and_blas_arg(activation);
|
||||
bool okay = true;
|
||||
if (is_float_output_with_half_input) {
|
||||
#else
|
||||
if (at::cuda::tunable::getTuningContext()->IsTunableOpEnabled()) {
|
||||
TORCH_CHECK(false, "Tunable GEMM is not supported for float output with reduced float input");
|
||||
}
|
||||
AT_DISPATCH_REDUCED_FLOATING_TYPES(
|
||||
scalar_type,
|
||||
"addmm_cuda_lt",
|
||||
[&] {
|
||||
auto tuning_ctx = at::cuda::tunable::getTuningContext();
|
||||
if (tuning_ctx->IsTunableOpEnabled()) {
|
||||
TORCH_CHECK(false, "Tunable GEMM is not supported for float output with reduced float input");
|
||||
lt_success = launchGemmAndBiasCublasLt<scalar_t, float>(args, self, alpha, activation);
|
||||
}
|
||||
else {
|
||||
okay = at::cuda::blas::gemm_and_bias<scalar_t, float>(
|
||||
args.transa == 't',
|
||||
args.transb == 't',
|
||||
args.m,
|
||||
args.n,
|
||||
args.k,
|
||||
alpha.to<at::opmath_type<scalar_t>>(),
|
||||
args.mata->const_data_ptr<scalar_t>(),
|
||||
args.lda,
|
||||
args.matb->const_data_ptr<scalar_t>(),
|
||||
args.ldb,
|
||||
self.const_data_ptr<scalar_t>(),
|
||||
args.result->data_ptr<float>(),
|
||||
args.result_ld,
|
||||
activation_epilogue
|
||||
);
|
||||
}});
|
||||
);
|
||||
#endif
|
||||
} else {
|
||||
// !is_float_output_with_half_input
|
||||
AT_DISPATCH_FLOATING_TYPES_AND2(
|
||||
at::ScalarType::Half,
|
||||
at::ScalarType::BFloat16,
|
||||
scalar_type,
|
||||
"addmm_cuda_lt",
|
||||
[&] {
|
||||
auto tuning_ctx = at::cuda::tunable::getTuningContext();
|
||||
if (tuning_ctx->IsTunableOpEnabled()) {
|
||||
launchTunableGemmAndBias<scalar_t>(
|
||||
args,
|
||||
alpha,
|
||||
self.const_data_ptr<scalar_t>(),
|
||||
activation_epilogue);
|
||||
lt_success = launchGemmAndBiasCublasLt<scalar_t>(args, self, alpha, activation);
|
||||
}
|
||||
else {
|
||||
okay = at::cuda::blas::gemm_and_bias<scalar_t>(
|
||||
args.transa == 't',
|
||||
args.transb == 't',
|
||||
args.m,
|
||||
args.n,
|
||||
args.k,
|
||||
alpha.to<at::opmath_type<scalar_t>>(),
|
||||
args.mata->const_data_ptr<scalar_t>(),
|
||||
args.lda,
|
||||
args.matb->const_data_ptr<scalar_t>(),
|
||||
args.ldb,
|
||||
self.const_data_ptr<scalar_t>(),
|
||||
args.result->data_ptr<scalar_t>(),
|
||||
args.result_ld,
|
||||
activation_epilogue
|
||||
);
|
||||
}});
|
||||
}
|
||||
if (!okay) {
|
||||
// lt path failed; recurse but disable lt path
|
||||
);
|
||||
} // end is_float_output_with_half_input
|
||||
|
||||
if (!lt_success) {
|
||||
// lt path failed; recurse but disable lt path
|
||||
return addmm_out_cuda_impl(result, self, mat1, mat2, beta, alpha, activation, true);
|
||||
}
|
||||
#endif
|
||||
} else
|
||||
{
|
||||
// end Lt path
|
||||
} else {
|
||||
// No Lt, we use a GEMM instead
|
||||
if (is_float_output_with_half_input) {
|
||||
AT_DISPATCH_REDUCED_FLOATING_TYPES(
|
||||
scalar_type,
|
||||
"addmm_cuda",
|
||||
[&] {
|
||||
using opmath_t = at::opmath_type<scalar_t>;
|
||||
opmath_t alpha_val = alpha.to<opmath_t>();
|
||||
opmath_t beta_val = beta.to<opmath_t>();
|
||||
const scalar_t* mat1_ptr = args.mata->const_data_ptr<scalar_t>();
|
||||
const scalar_t* mat2_ptr = args.matb->const_data_ptr<scalar_t>();
|
||||
|
||||
float* result_ptr = args.result->mutable_data_ptr<float>();
|
||||
at::cuda::blas::gemm<scalar_t, float>(
|
||||
args.transa,
|
||||
args.transb,
|
||||
args.m,
|
||||
args.n,
|
||||
args.k,
|
||||
alpha_val,
|
||||
mat1_ptr,
|
||||
args.lda,
|
||||
mat2_ptr,
|
||||
args.ldb,
|
||||
beta_val,
|
||||
result_ptr,
|
||||
args.result_ld);
|
||||
});
|
||||
launchGemmCublas<scalar_t, float>(args, alpha, beta);
|
||||
}
|
||||
);
|
||||
} else {
|
||||
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(
|
||||
at::ScalarType::Half,
|
||||
@ -614,28 +622,12 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
|
||||
scalar_type,
|
||||
"addmm_cuda",
|
||||
[&] {
|
||||
using opmath_t = at::opmath_type<scalar_t>;
|
||||
opmath_t alpha_val = alpha.to<opmath_t>();
|
||||
opmath_t beta_val = beta.to<opmath_t>();
|
||||
const scalar_t* mat1_ptr = args.mata->const_data_ptr<scalar_t>();
|
||||
const scalar_t* mat2_ptr = args.matb->const_data_ptr<scalar_t>();
|
||||
scalar_t* result_ptr = args.result->mutable_data_ptr<scalar_t>();
|
||||
at::cuda::blas::gemm<scalar_t>(
|
||||
args.transa,
|
||||
args.transb,
|
||||
args.m,
|
||||
args.n,
|
||||
args.k,
|
||||
alpha_val,
|
||||
mat1_ptr,
|
||||
args.lda,
|
||||
mat2_ptr,
|
||||
args.ldb,
|
||||
beta_val,
|
||||
result_ptr,
|
||||
args.result_ld);
|
||||
});
|
||||
launchGemmCublas<scalar_t>(args, alpha, beta);
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
// Apply epilogue
|
||||
switch (activation) {
|
||||
case Activation::RELU:
|
||||
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
|
||||
@ -647,14 +639,14 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
|
||||
break;
|
||||
default: break;
|
||||
}
|
||||
}
|
||||
} // end GEMM path
|
||||
|
||||
// Preprocessor gate here needs to match the inverse of the check
|
||||
// gating activation_to_gemm_and_blas_arg above; here we are manually
|
||||
// performing a post-GELU because we weren't able to use the GELU
|
||||
// epilogue above.
|
||||
#if !defined(CUDA_VERSION) && !defined(USE_ROCM)
|
||||
if (useLtInterface && activation == Activation::GELU) {
|
||||
if (!disable_addmm_cuda_lt && activation == Activation::GELU) {
|
||||
at::gelu_(const_cast<Tensor&>(*args.result), "tanh");
|
||||
}
|
||||
#endif
|
||||
|
||||
@ -1,18 +1,17 @@
|
||||
#pragma once
|
||||
|
||||
#include <ATen/OpMathType.h>
|
||||
#include <ATen/cuda/detail/OffsetCalculator.cuh>
|
||||
#include <ATen/detail/FunctionTraits.h>
|
||||
#include <ATen/native/TensorIterator.h>
|
||||
#include <ATen/native/TensorIteratorDynamicCasting.h>
|
||||
#include <ATen/cuda/detail/OffsetCalculator.cuh>
|
||||
#include <ATen/OpMathType.h>
|
||||
#include <ATen/native/cuda/thread_constants.h>
|
||||
|
||||
#include <thrust/tuple.h>
|
||||
|
||||
#include <ATen/native/cuda/MemoryAccess.cuh>
|
||||
|
||||
#include <tuple>
|
||||
|
||||
|
||||
|
||||
namespace at::native {
|
||||
|
||||
template<int N>
|
||||
@ -62,7 +61,11 @@ __device__ inline void elementwise_kernel_helper(func_t f, policy_t policy) {
|
||||
#pragma unroll
|
||||
for (int i = 0; i < elems_per_thread; i++) {
|
||||
if (policy.check_inbounds(i)) {
|
||||
#if defined(__HIP__)
|
||||
results[i] = c10::guts::apply(f, args[i]);
|
||||
#else
|
||||
results[i] = std::apply(f, args[i]);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -23,7 +23,7 @@ namespace at::native {
|
||||
|
||||
// The maximum number of threads in a block
|
||||
#if defined(USE_ROCM)
|
||||
constexpr int MAX_BLOCK_SIZE = 256;
|
||||
constexpr int MAX_BLOCK_SIZE = 1024;
|
||||
#else
|
||||
constexpr int MAX_BLOCK_SIZE = 512;
|
||||
#endif
|
||||
@ -33,7 +33,7 @@ constexpr unsigned MAX_GRID_SIZE = 65535u;
|
||||
// Number of threads in a block given an input size up to MAX_BLOCK_SIZE
|
||||
static int getNumThreads(int nElem) {
|
||||
#if defined(USE_ROCM)
|
||||
int threadSizes[5] = { 16, 32, 64, 128, MAX_BLOCK_SIZE };
|
||||
int threadSizes[5] = { 64, 128, 256, 512, MAX_BLOCK_SIZE };
|
||||
#else
|
||||
int threadSizes[5] = { 32, 64, 128, 256, MAX_BLOCK_SIZE };
|
||||
#endif
|
||||
@ -115,9 +115,23 @@ __device__ scalar_t reduce(Op op, PTA tensor, int plane) {
|
||||
// first the reductions each thread does separately
|
||||
scalar_t sum = static_cast<scalar_t>(0);
|
||||
for (int batch = threadIdx.y; batch < tensor.size(0); batch += blockDim.y) {
|
||||
#if defined(USE_ROCM)
|
||||
constexpr int UNRL = 4; // load deserilize factor
|
||||
scalar_t tmp[UNRL];
|
||||
for (int x = threadIdx.x; x < tensor.size(2); x += blockDim.x*UNRL) {
|
||||
#pragma unroll
|
||||
for (int u = 0; u < UNRL; u++)
|
||||
tmp[u] = op(batch, plane, min((int)tensor.size(2)-1, (int)(x+u*blockDim.x)));
|
||||
#pragma unroll
|
||||
for (int u = 0; u < UNRL; u++)
|
||||
if (x+u*blockDim.x < tensor.size(2))
|
||||
sum += tmp[u];
|
||||
}
|
||||
#else
|
||||
for (int x = threadIdx.x; x < tensor.size(2); x += blockDim.x) {
|
||||
sum += op(batch, plane, x);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
__shared__ scalar_t shared[C10_WARP_SIZE];
|
||||
SumReduceOp<scalar_t> reduce_op;
|
||||
|
||||
@ -653,8 +653,14 @@ struct ReduceOp {
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
// Intra-warp reduction, fix CUDA to have offset decreasing for better numerics
|
||||
// matching Triton, etc.
|
||||
// TODO(PaulZhang12): AMD and internal
|
||||
#if defined(USE_ROCM) || defined(FBCODE_CAFFE2)
|
||||
for (int offset = 1; offset < dim_x; offset <<= 1) {
|
||||
#else
|
||||
for (int offset = dim_x >> 1; offset > 0; offset >>= 1) {
|
||||
#endif
|
||||
#pragma unroll
|
||||
for (int i = 0; i < output_vec_size; i++) {
|
||||
arg_t other = ops.warp_shfl_down(value[i], offset);
|
||||
|
||||
@ -92,6 +92,16 @@ inline thrust::pair<int64_t, int64_t> get_index_mapping2d(
|
||||
output_offset + output_y * output_dim_x + output_x);
|
||||
}
|
||||
|
||||
// Maps an arbitrary (possibly negative or out-of-range) coordinate `x` onto
// [0, len) by reflecting about the first and last elements without repeating
// them, i.e. with period 2 * (len - 1).
// Returns 0 when that period is not positive (len <= 1).
__device__ __forceinline__ int64_t reflect_index(int64_t x, int64_t len) {
  const int64_t period = (len - 1) * 2;
  if (period <= 0) {
    return 0;
  }

  // C++ `%` keeps the sign of the dividend, so shift negatives into [0, period).
  int64_t pos = x % period;
  if (pos < 0) {
    pos += period;
  }

  // The second half of the period walks back toward index 0.
  if (pos >= len) {
    pos = period - pos;
  }
  return pos;
}
|
||||
|
||||
template<typename scalar_t>
|
||||
__global__ void reflection_pad1d_out_kernel(
|
||||
const scalar_t * input, scalar_t * output,
|
||||
@ -106,6 +116,28 @@ __global__ void reflection_pad1d_out_kernel(
|
||||
}
|
||||
}
|
||||
|
||||
// 1-D launch variant of reflection padding along the last dimension.
//
// Treats `output` as `plane_count` rows of `out_w` elements and `input` as
// `plane_count` rows of `input_w` elements. Each thread starts at its global
// linear index and advances by blockDim.x * gridDim.x until all
// plane_count * out_w output elements are written. Every output element at
// column x is read from input column reflect_index(x - pad_l, input_w) of the
// same row. `pad_r` is accepted but not read in this kernel.
template <typename scalar_t>
__global__ void reflection_pad1d_flat(
    const scalar_t* __restrict__ input,
    scalar_t* __restrict__ output,
    int64_t input_w, int64_t pad_l, int64_t pad_r,
    int64_t out_w, int64_t plane_count) {
  const int64_t threads_per_block = blockDim.x;
  const int64_t num_outputs = plane_count * out_w;
  const int64_t step = threads_per_block * gridDim.x;

  for (int64_t idx = static_cast<int64_t>(blockIdx.x) * threads_per_block + threadIdx.x;
       idx < num_outputs;
       idx += step) {
    // Split the flat index into (row, column) of the output.
    const int64_t row = idx / out_w;
    const int64_t col = idx - row * out_w;
    const int64_t src_col = reflect_index(col - pad_l, input_w);
    // row * out_w + col is exactly idx, so the flat index is written directly.
    output[idx] = input[row * input_w + src_col];
  }
}
|
||||
|
||||
template <typename scalar_t>
|
||||
__global__ void reflection_pad1d_backward_out_kernel(
|
||||
scalar_t * grad_input, const scalar_t * grad_output,
|
||||
@ -710,25 +742,44 @@ TORCH_IMPL_FUNC(reflection_pad1d_out_cuda)
|
||||
int64_t input_w = input_.size(dim_w);
|
||||
int64_t output_w = input_w + pad_l + pad_r;
|
||||
|
||||
dim3 block_size(output_w > 256 ? 256 : output_w);
|
||||
dim3 grid_size((int)::ceil(output_w / 256.0), nplane, nbatch);
|
||||
|
||||
Tensor input = input_.contiguous();
|
||||
|
||||
AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(
|
||||
kHalf, kBFloat16, input.scalar_type(), "reflection_pad1d_out_template", [&] {
|
||||
reflection_pad1d_out_kernel<<<
|
||||
grid_size,
|
||||
block_size,
|
||||
0,
|
||||
at::cuda::getCurrentCUDAStream()>>>(
|
||||
input.const_data_ptr<scalar_t>(),
|
||||
output.mutable_data_ptr<scalar_t>(),
|
||||
input_w,
|
||||
pad_l,
|
||||
pad_r);
|
||||
C10_CUDA_KERNEL_LAUNCH_CHECK();
|
||||
});
|
||||
const int block_x = static_cast<int>(std::min<int64_t>(256, std::max<int64_t>(1, output_w)));
|
||||
const cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties();
|
||||
const int max_x = prop->maxGridSize[0];
|
||||
const int max_y = prop->maxGridSize[1];
|
||||
const int max_z = prop->maxGridSize[2];
|
||||
|
||||
AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kHalf, kBFloat16, input.scalar_type(), "reflection_pad1d_out", [&] {
|
||||
auto stream = at::cuda::getCurrentCUDAStream();
|
||||
|
||||
const int64_t gx = at::ceil_div(output_w, static_cast<int64_t>(block_x));
|
||||
|
||||
const bool fits3d = (nplane <= max_y) && (nbatch <= max_z) && (gx <= max_x);
|
||||
|
||||
if (fits3d) {
|
||||
dim3 block(block_x, 1, 1);
|
||||
dim3 grid(gx, static_cast<unsigned>(nplane), static_cast<unsigned>(nbatch));
|
||||
reflection_pad1d_out_kernel<scalar_t><<<grid, block, 0, stream>>>(
|
||||
input.const_data_ptr<scalar_t>(),
|
||||
output.mutable_data_ptr<scalar_t>(),
|
||||
input_w, pad_l, pad_r);
|
||||
} else {
|
||||
dim3 block(block_x, 1, 1);
|
||||
const int64_t plane_count = nplane * nbatch;
|
||||
const int64_t total_blocks = at::ceil_div(plane_count * output_w, static_cast<int64_t>(block_x));
|
||||
const int grid_x = static_cast<int>(std::min<int64_t>(max_x, std::max<int64_t>(1, total_blocks)));
|
||||
dim3 grid(grid_x, 1, 1);
|
||||
|
||||
reflection_pad1d_flat<scalar_t><<<grid, block, 0, stream>>>(
|
||||
input.const_data_ptr<scalar_t>(),
|
||||
output.mutable_data_ptr<scalar_t>(),
|
||||
input_w, pad_l, pad_r, output_w, plane_count);
|
||||
}
|
||||
|
||||
C10_CUDA_KERNEL_LAUNCH_CHECK();
|
||||
});
|
||||
}
|
||||
|
||||
TORCH_IMPL_FUNC(reflection_pad1d_backward_out_cuda)(const Tensor& grad_output_,
|
||||
|
||||
@ -44,7 +44,7 @@ __global__ void triu_tril_kernel(
|
||||
const int64_t k,
|
||||
const int64_t N_padded,
|
||||
const IndexType last_dim_padded) {
|
||||
int64_t linear_idx = (blockIdx.x * blockDim.x + threadIdx.x) * elements_per_thread;
|
||||
int64_t linear_idx = (((int64_t)blockIdx.x) * blockDim.x + threadIdx.x) * elements_per_thread;
|
||||
if (linear_idx >= N_padded) {
|
||||
return;
|
||||
}
|
||||
|
||||
@ -52,7 +52,7 @@ struct FusedAdagradMathFunctor {
|
||||
using opmath_t = at::opmath_type<scalar_t>;
|
||||
|
||||
C10_DEVICE __forceinline__ void operator()(
|
||||
int chunk_size,
|
||||
int64_t chunk_size,
|
||||
FusedOptimizerTensorListMetadata<3>& tl,
|
||||
const float* lr_ptr,
|
||||
const double& lr,
|
||||
@ -133,4 +133,4 @@ struct FusedAdagradMathFunctor {
|
||||
|
||||
} // namespace
|
||||
|
||||
} // namespace at::native
|
||||
} // namespace at::native
|
||||
|
||||
@ -466,7 +466,11 @@ struct ReduceJitOp {
|
||||
|
||||
__syncthreads();
|
||||
|
||||
#if defined(USE_ROCM) || defined(FBCODE_CAFFE2)
|
||||
for (int offset = 1; offset < dim_x; offset <<= 1) {
|
||||
#else
|
||||
for (int offset = dim_x >> 1; offset > 0; offset >>= 1) {
|
||||
#endif
|
||||
#pragma unroll
|
||||
for (int i = 0; i < output_vec_size; i++) {
|
||||
arg_t other = reducer::warp_shfl_down(value[i], offset);
|
||||
|
||||
@ -441,7 +441,7 @@ kernel void applySYRK(
|
||||
uint3 tid [[thread_position_in_threadgroup]],
|
||||
uint3 tgid [[threadgroup_position_in_grid]],
|
||||
uint3 tpg [[threads_per_threadgroup]],
|
||||
uint sgitg [[simdgroup_index_in_threadgroup]]) {
|
||||
uint warp_id [[simdgroup_index_in_threadgroup]]) {
|
||||
const uint tx = tid.x;
|
||||
const uint ty = tid.y;
|
||||
const uint simdGroupsPerThreadgroup = (tpg.x * tpg.y + 31) / 32;
|
||||
@ -474,11 +474,8 @@ kernel void applySYRK(
|
||||
(actSize_j % 8 == 0) && (actSize_h % 8 == 0) && (actSize_k % 8 == 0);
|
||||
|
||||
if (use_simdgroup) {
|
||||
uint warp_id = sgitg;
|
||||
|
||||
simdgroup_matrix<float, 8, 8> negative_identity =
|
||||
simdgroup_matrix<float, 8, 8>(-1.0);
|
||||
simdgroup_matrix<float, 8, 8> identity = simdgroup_matrix<float, 8, 8>(1.0);
|
||||
simdgroup_matrix<float, 8, 8> Prod;
|
||||
simdgroup_matrix<float, 8, 8> Afrag;
|
||||
simdgroup_matrix<float, 8, 8> Bfrag;
|
||||
@ -521,8 +518,7 @@ kernel void applySYRK(
|
||||
/* transpose = */ upper);
|
||||
|
||||
simdgroup_multiply(Prod, Afrag, Bfrag);
|
||||
simdgroup_multiply(Prod, Prod, negative_identity);
|
||||
simdgroup_multiply_accumulate(Cfrag, Cfrag, identity, Prod);
|
||||
simdgroup_multiply_accumulate(Cfrag, Prod, negative_identity, Cfrag);
|
||||
}
|
||||
|
||||
simdgroup_store(
|
||||
|
||||
@ -92,13 +92,8 @@ static std::tuple<Tensor, Tensor> sdpa_general_mps(const Tensor& query,
|
||||
}
|
||||
|
||||
// upcasting to float32 if needed to improve precision when multiplying by the scale factor
|
||||
if ([maskedMM dataType] != MPSDataTypeFloat32) {
|
||||
maskedMM = [mpsGraph castTensor:maskedMM toType:MPSDataTypeFloat32 name:nil];
|
||||
}
|
||||
maskedMM = castMPSTensor(mpsGraph, maskedMM, MPSDataTypeFloat32);
|
||||
maskedMM = [mpsGraph multiplicationWithPrimaryTensor:maskedMM secondaryTensor:scaleTensor name:nil];
|
||||
if ([maskedMM dataType] != qTensor.dataType) {
|
||||
maskedMM = [mpsGraph castTensor:maskedMM toType:qTensor.dataType name:nil];
|
||||
}
|
||||
|
||||
if (is_causal) {
|
||||
auto causalMask = [mpsGraph constantWithScalar:1.0f
|
||||
@ -112,7 +107,9 @@ static std::tuple<Tensor, Tensor> sdpa_general_mps(const Tensor& query,
|
||||
name:nil];
|
||||
} else if (attn_mask) {
|
||||
graph->maskTensor = mpsGraphRankedPlaceHolder(mpsGraph, *attn_mask);
|
||||
maskedMM = [mpsGraph additionWithPrimaryTensor:maskedMM secondaryTensor:graph->maskTensor name:nil];
|
||||
maskedMM = [mpsGraph additionWithPrimaryTensor:maskedMM
|
||||
secondaryTensor:castMPSTensor(mpsGraph, graph->maskTensor, maskedMM.dataType)
|
||||
name:nil];
|
||||
}
|
||||
|
||||
// Account for case where all values were masked causing division by 0 in softmax (issue:#156707)
|
||||
@ -133,8 +130,8 @@ static std::tuple<Tensor, Tensor> sdpa_general_mps(const Tensor& query,
|
||||
graph->qTensor = qTensor;
|
||||
graph->kTensor = kTensor;
|
||||
graph->vTensor = vTensor;
|
||||
graph->outputTensor = output;
|
||||
graph->attnTensor = sm;
|
||||
graph->outputTensor = castMPSTensor(mpsGraph, output, qTensor.dataType);
|
||||
graph->attnTensor = castMPSTensor(mpsGraph, sm, qTensor.dataType);
|
||||
});
|
||||
auto qPlaceholder = Placeholder(cachedGraph->qTensor, query);
|
||||
auto kPlaceholder = Placeholder(cachedGraph->kTensor, key);
|
||||
|
||||
@ -338,6 +338,8 @@ static void linalg_lu_factor_ex_out_mps_impl(const Tensor& A,
|
||||
". See https://developer.apple.com/documentation/metalperformanceshaders/mpsmatrixdecompositionstatus for details.");
|
||||
}
|
||||
}
|
||||
|
||||
map_mps_decomposition_error_code_to_blas(info);
|
||||
}
|
||||
|
||||
static void linalg_solve_out_mps_impl(const Tensor& A,
|
||||
@ -1448,20 +1450,6 @@ TORCH_IMPL_FUNC(_linalg_solve_ex_out_mps)
|
||||
mps::linalg_solve_out_mps_impl(A, B, left, check_errors, result, LU, pivots, info);
|
||||
}
|
||||
|
||||
std::tuple<Tensor&, Tensor&> linalg_lu_factor_out_mps(const Tensor& A, bool pivot, Tensor& LU, Tensor& pivots) {
|
||||
Tensor info = at::empty({}, A.options().dtype(kInt));
|
||||
mps::linalg_lu_factor_ex_out_mps_impl(A, pivot, LU, pivots, info, false);
|
||||
return std::tie(LU, pivots);
|
||||
}
|
||||
|
||||
std::tuple<Tensor, Tensor> linalg_lu_factor_mps(const Tensor& A, bool pivot) {
|
||||
Tensor LU = at::empty({0}, A.options());
|
||||
Tensor pivots = at::empty({0}, A.options().dtype(kInt));
|
||||
Tensor info = at::empty({}, A.options().dtype(kInt));
|
||||
mps::linalg_lu_factor_ex_out_mps_impl(A, pivot, LU, pivots, info, false);
|
||||
return std::make_tuple(std::move(LU), std::move(pivots));
|
||||
}
|
||||
|
||||
TORCH_IMPL_FUNC(lu_unpack_out_mps)
|
||||
(const Tensor& LU_data,
|
||||
const Tensor& LU_pivots,
|
||||
|
||||
@ -706,6 +706,7 @@
|
||||
variants: function, method
|
||||
dispatch:
|
||||
NestedTensorCPU, NestedTensorHPU, NestedTensorCUDA: NestedTensor_all
|
||||
tags: reduction
|
||||
|
||||
|
||||
- func: all.dims(Tensor self, int[]? dim=None, bool keepdim=False) -> Tensor
|
||||
@ -715,6 +716,7 @@
|
||||
cpp_no_default_args: ['dim']
|
||||
dispatch:
|
||||
CompositeExplicitAutograd: all_dims_default
|
||||
tags: reduction
|
||||
|
||||
- func: all.out(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
|
||||
device_check: NoCheck # TensorIterator
|
||||
@ -723,6 +725,7 @@
|
||||
CPU, CUDA: all_out
|
||||
MPS: all_out_mps
|
||||
MTIA: all_out_mtia
|
||||
tags: reduction
|
||||
|
||||
- func: all.dims_out(Tensor self, int[]? dim=None, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
|
||||
device_check: NoCheck # TensorIterator
|
||||
@ -731,13 +734,16 @@
|
||||
CPU, CUDA: all_dims_out
|
||||
CompositeExplicitAutograd: all_dims_out_default
|
||||
cpp_no_default_args: ['dim']
|
||||
tags: reduction
|
||||
|
||||
- func: all.dimname(Tensor self, Dimname dim, bool keepdim=False) -> Tensor
|
||||
device_check: NoCheck # TensorIterator
|
||||
variants: function, method
|
||||
tags: reduction
|
||||
|
||||
- func: all.dimname_out(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
|
||||
device_check: NoCheck # TensorIterator
|
||||
tags: reduction
|
||||
|
||||
- func: allclose(Tensor self, Tensor other, float rtol=1e-05, float atol=1e-08, bool equal_nan=False) -> bool
|
||||
variants: function, method
|
||||
@ -749,14 +755,14 @@
|
||||
device_check: NoCheck # TensorIterator
|
||||
structured_delegate: any.out
|
||||
variants: function, method
|
||||
tags: core
|
||||
tags: [core, reduction]
|
||||
|
||||
- func: any.dims(Tensor self, int[]? dim=None, bool keepdim=False) -> Tensor
|
||||
device_check: NoCheck # TensorIterator
|
||||
structured_delegate: any.dims_out
|
||||
variants: function, method
|
||||
cpp_no_default_args: ['dim']
|
||||
tags: core
|
||||
tags: [core, reduction]
|
||||
dispatch:
|
||||
CompositeExplicitAutograd: any_dims_default
|
||||
|
||||
@ -766,6 +772,7 @@
|
||||
dispatch:
|
||||
CPU, CUDA: any_out
|
||||
MPS: any_out_mps
|
||||
tags: reduction
|
||||
|
||||
- func: any.dims_out(Tensor self, int[]? dim=None, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
|
||||
device_check: NoCheck # TensorIterator
|
||||
@ -774,13 +781,16 @@
|
||||
CPU, CUDA: any_dims_out
|
||||
CompositeExplicitAutograd: any_dims_out_default
|
||||
cpp_no_default_args: ['dim']
|
||||
tags: reduction
|
||||
|
||||
- func: any.dimname(Tensor self, Dimname dim, bool keepdim=False) -> Tensor
|
||||
device_check: NoCheck # TensorIterator
|
||||
variants: function, method
|
||||
tags: reduction
|
||||
|
||||
- func: any.dimname_out(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
|
||||
device_check: NoCheck # TensorIterator
|
||||
tags: reduction
|
||||
|
||||
- func: arange(Scalar end, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
|
||||
dispatch:
|
||||
@ -826,25 +836,27 @@
|
||||
structured_delegate: argmax.out
|
||||
device_check: NoCheck # TensorIterator
|
||||
variants: function, method
|
||||
tags: core
|
||||
tags: [core, reduction]
|
||||
|
||||
- func: argmax.out(Tensor self, int? dim=None, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
|
||||
structured: True
|
||||
dispatch:
|
||||
CPU, CUDA: argmax_out
|
||||
MPS: argmax_out_mps
|
||||
tags: reduction
|
||||
|
||||
- func: argmin(Tensor self, int? dim=None, bool keepdim=False) -> Tensor
|
||||
structured_delegate: argmin.out
|
||||
device_check: NoCheck # TensorIterator
|
||||
variants: function, method
|
||||
tags: core
|
||||
tags: [core, reduction]
|
||||
|
||||
- func: argmin.out(Tensor self, int? dim=None, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
|
||||
structured: True
|
||||
dispatch:
|
||||
CPU, CUDA: argmin_out
|
||||
MPS: argmin_out_mps
|
||||
tags: reduction
|
||||
|
||||
- func: acosh(Tensor self) -> Tensor
|
||||
variants: function, method
|
||||
@ -1869,12 +1881,14 @@
|
||||
CUDA: count_nonzero_cuda
|
||||
MPS: count_nonzero_mps
|
||||
autogen: count_nonzero.dim_IntList_out
|
||||
tags: reduction
|
||||
|
||||
- func: count_nonzero(Tensor self, int? dim=None) -> Tensor
|
||||
variants: function, method
|
||||
dispatch:
|
||||
CompositeExplicitAutograd: count_nonzero
|
||||
autogen: count_nonzero.out
|
||||
tags: reduction
|
||||
|
||||
- func: cov(Tensor self, *, int correction=1, Tensor? fweights=None, Tensor? aweights=None) -> Tensor
|
||||
variants: function, method
|
||||
@ -3795,19 +3809,23 @@
|
||||
variants: function, method
|
||||
dispatch:
|
||||
CompositeExplicitAutograd: logsumexp
|
||||
tags: reduction
|
||||
|
||||
- func: logsumexp.out(Tensor self, int[1] dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
|
||||
device_check: NoCheck # TensorIterator
|
||||
dispatch:
|
||||
# calls squeeze
|
||||
CompositeExplicitAutogradNonFunctional: logsumexp_out
|
||||
tags: reduction
|
||||
|
||||
- func: logsumexp.names(Tensor self, Dimname[1] dim, bool keepdim=False) -> Tensor
|
||||
device_check: NoCheck # TensorIterator
|
||||
variants: function, method
|
||||
tags: reduction
|
||||
|
||||
- func: logsumexp.names_out(Tensor self, Dimname[1] dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
|
||||
device_check: NoCheck # TensorIterator
|
||||
tags: reduction
|
||||
|
||||
- func: margin_ranking_loss(Tensor input1, Tensor input2, Tensor target, float margin=0.0, int reduction=Mean) -> Tensor
|
||||
|
||||
@ -3857,6 +3875,7 @@
|
||||
device_check: NoCheck # TensorIterator
|
||||
structured_delegate: aminmax.out
|
||||
variants: function, method
|
||||
tags: reduction
|
||||
|
||||
- func: aminmax.out(Tensor self, *, int? dim=None, bool keepdim=False, Tensor(a!) min, Tensor(b!) max) -> (Tensor(a!) min, Tensor(b!) max)
|
||||
device_check: NoCheck # TensorIterator
|
||||
@ -3864,6 +3883,7 @@
|
||||
dispatch:
|
||||
CPU, CUDA, MTIA: aminmax_out
|
||||
MPS: aminmax_out_mps
|
||||
tags: reduction
|
||||
|
||||
- func: _compute_linear_combination(Tensor input, Tensor coefficients) -> Tensor
|
||||
dispatch:
|
||||
@ -3879,7 +3899,7 @@
|
||||
variants: function, method
|
||||
dispatch:
|
||||
QuantizedCPU, QuantizedCUDA: qmax
|
||||
tags: core
|
||||
tags: [core, reduction]
|
||||
|
||||
- func: max.dim_max(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) max, Tensor(b!) max_values) -> (Tensor(a!) values, Tensor(b!) indices)
|
||||
device_check: NoCheck # TensorIterator
|
||||
@ -3889,13 +3909,16 @@
|
||||
dispatch:
|
||||
CPU, CUDA, MTIA: max_out
|
||||
MPS: max_out_mps
|
||||
tags: reduction
|
||||
|
||||
- func: max.names_dim(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices)
|
||||
device_check: NoCheck # TensorIterator
|
||||
variants: function, method
|
||||
tags: reduction
|
||||
|
||||
- func: max.names_dim_max(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) max, Tensor(b!) max_values) -> (Tensor(a!) values, Tensor(b!) indices)
|
||||
device_check: NoCheck # TensorIterator
|
||||
tags: reduction
|
||||
|
||||
- func: value_selecting_reduction_backward(Tensor grad, int dim, Tensor indices, SymInt[] sizes, bool keepdim) -> Tensor
|
||||
variants: function
|
||||
@ -3908,13 +3931,14 @@
|
||||
- func: amax(Tensor self, int[1] dim=[], bool keepdim=False) -> Tensor
|
||||
variants: function, method
|
||||
structured_delegate: amax.out
|
||||
tags: core
|
||||
tags: [core, reduction]
|
||||
|
||||
- func: amax.out(Tensor self, int[1] dim=[], bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
|
||||
structured: True
|
||||
dispatch:
|
||||
CPU, CUDA, MTIA: amax_out
|
||||
MPS: amax_out_mps
|
||||
tags: reduction
|
||||
|
||||
# Return: (Tensor output, Tensor indices)
|
||||
- func: max_pool1d_with_indices(Tensor self, int[1] kernel_size, int[1] stride=[], int[1] padding=0, int[1] dilation=1, bool ceil_mode=False) -> (Tensor, Tensor)
|
||||
@ -3976,13 +4000,14 @@
|
||||
variants: function, method
|
||||
dispatch:
|
||||
CompositeExplicitAutograd: mean
|
||||
tags: core
|
||||
tags: [core, reduction]
|
||||
|
||||
# For normal naming convention this should be `mean.out`. However since we already have `mean.out` we have to rename this.
|
||||
- func: mean.dtype_out(Tensor self, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
|
||||
device_check: NoCheck # TensorIterator
|
||||
dispatch:
|
||||
CompositeExplicitAutograd: mean_dtype_out
|
||||
tags: reduction
|
||||
|
||||
- func: mean.dim(Tensor self, int[1]? dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
|
||||
structured_delegate: mean.out
|
||||
@ -3990,7 +4015,7 @@
|
||||
variants: function, method
|
||||
dispatch:
|
||||
QuantizedCPU: mean_quantized_cpu
|
||||
tags: core
|
||||
tags: [core, reduction]
|
||||
|
||||
- func: mean.out(Tensor self, int[1]? dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
|
||||
structured: True
|
||||
@ -3999,13 +4024,16 @@
|
||||
CPU, CUDA: mean_out
|
||||
MPS: mean_out_mps
|
||||
QuantizedCPU: mean_out_quantized_cpu
|
||||
tags: reduction
|
||||
|
||||
- func: mean.names_dim(Tensor self, Dimname[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
|
||||
device_check: NoCheck # TensorIterator
|
||||
variants: function, method
|
||||
tags: reduction
|
||||
|
||||
- func: mean.names_out(Tensor self, Dimname[1] dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
|
||||
device_check: NoCheck # TensorIterator
|
||||
tags: reduction
|
||||
|
||||
- func: nanmean(Tensor self, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
|
||||
device_check: NoCheck # Composite
|
||||
@ -4068,7 +4096,7 @@
|
||||
variants: function, method
|
||||
dispatch:
|
||||
QuantizedCPU, QuantizedCUDA: qmin
|
||||
tags: core
|
||||
tags: [core, reduction]
|
||||
|
||||
- func: min.dim_min(Tensor self, int dim, bool keepdim=False, *, Tensor(a!) min, Tensor(b!) min_indices) -> (Tensor(a!) values, Tensor(b!) indices)
|
||||
device_check: NoCheck # TensorIterator
|
||||
@ -4078,24 +4106,28 @@
|
||||
dispatch:
|
||||
CPU, CUDA, MTIA: min_out
|
||||
MPS: min_out_mps
|
||||
tags: reduction
|
||||
|
||||
- func: min.names_dim(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices)
|
||||
device_check: NoCheck # TensorIterator
|
||||
variants: function, method
|
||||
tags: reduction
|
||||
|
||||
- func: min.names_dim_min(Tensor self, Dimname dim, bool keepdim=False, *, Tensor(a!) min, Tensor(b!) min_indices) -> (Tensor(a!) values, Tensor(b!) indices)
|
||||
device_check: NoCheck # TensorIterator
|
||||
tags: reduction
|
||||
|
||||
- func: amin(Tensor self, int[1] dim=[], bool keepdim=False) -> Tensor
|
||||
variants: function, method
|
||||
structured_delegate: amin.out
|
||||
tags: core
|
||||
tags: [core, reduction]
|
||||
|
||||
- func: amin.out(Tensor self, int[1] dim=[], bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
|
||||
structured: True
|
||||
dispatch:
|
||||
CPU, CUDA, MTIA: amin_out
|
||||
MPS: amin_out_mps
|
||||
tags: reduction
|
||||
|
||||
# TODO: Add this function to MPS dispatch key so that we avoid declaring it in
|
||||
# native_functions.yaml
|
||||
@ -5860,6 +5892,7 @@
|
||||
SparseCPU, SparseCUDA, SparseMPS, SparseMeta: sum_coo
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sum_csr
|
||||
autogen: sum.out
|
||||
tags: reduction
|
||||
|
||||
- func: sum.dim_IntList(Tensor self, int[1]? dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
|
||||
# TODO: Align the signature of sum.dim_IntList and _sparse_csr_sum.dim_dtype
|
||||
@ -5870,11 +5903,12 @@
|
||||
NestedTensorCPU: NestedTensor_sum_dim_CPU
|
||||
SparseCPU, SparseCUDA, SparseMPS: sum_sparse_coo
|
||||
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sum_sparse_compressed
|
||||
tags: core
|
||||
tags: [core, reduction]
|
||||
|
||||
- func: sum.dim_DimnameList(Tensor self, Dimname[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
|
||||
device_check: NoCheck # TensorIterator
|
||||
variants: function, method
|
||||
tags: reduction
|
||||
|
||||
- func: sum.IntList_out(Tensor self, int[1]? dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
|
||||
structured: True
|
||||
@ -5882,9 +5916,11 @@
|
||||
dispatch:
|
||||
CPU, CUDA: sum_out
|
||||
MPS: sum_out_mps
|
||||
tags: reduction
|
||||
|
||||
- func: sum.DimnameList_out(Tensor self, Dimname[1] dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
|
||||
device_check: NoCheck # TensorIterator
|
||||
tags: reduction
|
||||
|
||||
# TODO: this function will be replaced once nested expand semantics have been settled on
|
||||
- func: _nested_sum_backward(Tensor grad, Tensor self, int[1]? dim, bool keepdim=False) -> Tensor
|
||||
@ -5896,11 +5932,13 @@
|
||||
dispatch:
|
||||
CPU, CUDA: nansum
|
||||
MPS: nansum_mps
|
||||
tags: reduction
|
||||
|
||||
- func: nansum.out(Tensor self, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
|
||||
dispatch:
|
||||
CPU, CUDA: nansum_out
|
||||
MPS: nansum_out_mps
|
||||
tags: reduction
|
||||
|
||||
- func: hash_tensor(Tensor self, int[1] dim=[], *, bool keepdim=False, int mode=0) -> Tensor
|
||||
variants: function, method
|
||||
@ -5964,11 +6002,13 @@
|
||||
device_check: NoCheck # TensorIterator
|
||||
variants: function, method
|
||||
cpp_no_default_args: ["unbiased"]
|
||||
tags: reduction
|
||||
|
||||
- func: std.dim(Tensor self, int[1]? dim, bool unbiased=True, bool keepdim=False) -> Tensor
|
||||
device_check: NoCheck # TensorIterator
|
||||
variants: function, method
|
||||
cpp_no_default_args: ["unbiased"]
|
||||
tags: reduction
|
||||
|
||||
- func: std.correction(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False) -> Tensor
|
||||
device_check: NoCheck # TensorIterator
|
||||
@ -5977,16 +6017,19 @@
|
||||
CPU, CUDA: std
|
||||
MPS: std_mps
|
||||
QuantizedCPU: std_quantized_cpu
|
||||
tags: reduction
|
||||
|
||||
- func: std_mean(Tensor self, bool unbiased=True) -> (Tensor, Tensor)
|
||||
device_check: NoCheck # TensorIterator
|
||||
variants: function
|
||||
cpp_no_default_args: ["unbiased"]
|
||||
tags: reduction
|
||||
|
||||
- func: std_mean.dim(Tensor self, int[1]? dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor)
|
||||
device_check: NoCheck # TensorIterator
|
||||
variants: function
|
||||
cpp_no_default_args: ["unbiased"]
|
||||
tags: reduction
|
||||
|
||||
- func: std_mean.correction(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False) -> (Tensor, Tensor)
|
||||
device_check: NoCheck # TensorIterator
|
||||
@ -5995,42 +6038,51 @@
|
||||
CPU, CUDA: std_mean
|
||||
MPS: std_mean_mps
|
||||
autogen: std_mean.correction_out
|
||||
tags: reduction
|
||||
|
||||
- func: std_mean.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor)
|
||||
device_check: NoCheck # TensorIterator
|
||||
variants: function
|
||||
cpp_no_default_args: ["unbiased"]
|
||||
tags: reduction
|
||||
|
||||
- func: std_mean.correction_names(Tensor self, Dimname[1] dim, *, Scalar? correction=None, bool keepdim=False) -> (Tensor, Tensor)
|
||||
device_check: NoCheck # TensorIterator
|
||||
variants: function
|
||||
tags: reduction
|
||||
|
||||
- func: std.out(Tensor self, int[1]? dim, bool unbiased=True, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
|
||||
device_check: NoCheck # TensorIterator
|
||||
cpp_no_default_args: ["unbiased"]
|
||||
tags: reduction
|
||||
|
||||
- func: std.correction_out(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False, Tensor(a!) out) -> Tensor(a!)
|
||||
device_check: NoCheck # TensorIterator
|
||||
dispatch:
|
||||
CPU, CUDA: std_out
|
||||
QuantizedCPU: std_out_quantized_cpu
|
||||
tags: reduction
|
||||
|
||||
- func: std.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> Tensor
|
||||
device_check: NoCheck # TensorIterator
|
||||
variants: function, method
|
||||
cpp_no_default_args: ["unbiased"]
|
||||
tags: reduction
|
||||
|
||||
- func: std.names_out(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
|
||||
device_check: NoCheck # TensorIterator
|
||||
cpp_no_default_args: ["unbiased"]
|
||||
tags: reduction
|
||||
|
||||
- func: std.correction_names(Tensor self, Dimname[1] dim, *, Scalar? correction=None, bool keepdim=False) -> Tensor
|
||||
device_check: NoCheck # TensorIterator
|
||||
variants: function, method
|
||||
tags: reduction
|
||||
|
||||
- func: std.correction_names_out(Tensor self, Dimname[1] dim, *, Scalar? correction=None, bool keepdim=False, Tensor(a!) out) -> Tensor(a!)
|
||||
device_check: NoCheck # TensorIterator
|
||||
variants: function
|
||||
tags: reduction
|
||||
|
||||
- func: prod(Tensor self, *, ScalarType? dtype=None) -> Tensor
|
||||
device_check: NoCheck # TensorIterator
|
||||
@ -6039,13 +6091,13 @@
|
||||
CPU, CUDA: prod
|
||||
MPS: prod_mps
|
||||
autogen: prod.out
|
||||
tags: core
|
||||
tags: [core, reduction]
|
||||
|
||||
- func: prod.dim_int(Tensor self, int dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
|
||||
structured_delegate: prod.int_out
|
||||
device_check: NoCheck # TensorIterator
|
||||
variants: function, method
|
||||
tags: core
|
||||
tags: [core, reduction]
|
||||
|
||||
- func: prod.int_out(Tensor self, int dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
|
||||
structured: True
|
||||
@ -6053,13 +6105,16 @@
|
||||
dispatch:
|
||||
CPU, CUDA: prod_out
|
||||
MPS: prod_out_mps
|
||||
tags: reduction
|
||||
|
||||
- func: prod.dim_Dimname(Tensor self, Dimname dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
|
||||
device_check: NoCheck # TensorIterator
|
||||
variants: function, method
|
||||
tags: reduction
|
||||
|
||||
- func: prod.Dimname_out(Tensor self, Dimname dim, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
|
||||
device_check: NoCheck # TensorIterator
|
||||
tags: reduction
|
||||
|
||||
- func: t(Tensor(a) self) -> Tensor(a)
|
||||
device_check: NoCheck
|
||||
@ -6520,11 +6575,12 @@
|
||||
device_check: NoCheck # TensorIterator
|
||||
variants: function, method
|
||||
cpp_no_default_args: ["unbiased"]
|
||||
tags: reduction
|
||||
|
||||
- func: var.dim(Tensor self, int[1]? dim, bool unbiased=True, bool keepdim=False) -> Tensor
|
||||
device_check: NoCheck # TensorIterator
|
||||
variants: function, method
|
||||
tags: core
|
||||
tags: [core, reduction]
|
||||
cpp_no_default_args: ["unbiased"]
|
||||
|
||||
- func: var.correction(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False) -> Tensor
|
||||
@ -6534,43 +6590,51 @@
|
||||
CPU, CUDA: var
|
||||
MPS: var_mps
|
||||
MTIA: var_mtia
|
||||
tags: core
|
||||
tags: [core, reduction]
|
||||
|
||||
- func: var.out(Tensor self, int[1]? dim, bool unbiased=True, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
|
||||
device_check: NoCheck # TensorIterator
|
||||
cpp_no_default_args: ["unbiased"]
|
||||
tags: reduction
|
||||
|
||||
- func: var.correction_out(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False, Tensor(a!) out) -> Tensor(a!)
|
||||
device_check: NoCheck # TensorIterator
|
||||
dispatch:
|
||||
CPU, CUDA: var_out
|
||||
tags: reduction
|
||||
|
||||
- func: var.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> Tensor
|
||||
device_check: NoCheck # TensorIterator
|
||||
variants: function, method
|
||||
cpp_no_default_args: ["unbiased"]
|
||||
tags: reduction
|
||||
|
||||
- func: var.names_out(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
|
||||
device_check: NoCheck # TensorIterator
|
||||
cpp_no_default_args: ["unbiased"]
|
||||
tags: reduction
|
||||
|
||||
- func: var.correction_names(Tensor self, Dimname[1] dim, *, Scalar? correction=None, bool keepdim=False) -> Tensor
|
||||
device_check: NoCheck # TensorIterator
|
||||
variants: function, method
|
||||
tags: reduction
|
||||
|
||||
- func: var.correction_names_out(Tensor self, Dimname[1] dim, *, Scalar? correction=None, bool keepdim=False, Tensor(a!) out) -> Tensor(a!)
|
||||
device_check: NoCheck # TensorIterator
|
||||
variants: function
|
||||
tags: reduction
|
||||
|
||||
- func: var_mean(Tensor self, bool unbiased=True) -> (Tensor, Tensor)
|
||||
device_check: NoCheck # TensorIterator
|
||||
variants: function
|
||||
cpp_no_default_args: ["unbiased"]
|
||||
tags: reduction
|
||||
|
||||
- func: var_mean.dim(Tensor self, int[1]? dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor)
|
||||
device_check: NoCheck # TensorIterator
|
||||
variants: function
|
||||
cpp_no_default_args: ["unbiased"]
|
||||
tags: reduction
|
||||
|
||||
- func: var_mean.correction(Tensor self, int[1]? dim=None, *, Scalar? correction=None, bool keepdim=False) -> (Tensor, Tensor)
|
||||
device_check: NoCheck # TensorIterator
|
||||
@ -6579,15 +6643,18 @@
|
||||
CPU, CUDA: var_mean
|
||||
MPS: var_mean_mps
|
||||
autogen: var_mean.correction_out
|
||||
tags: reduction
|
||||
|
||||
- func: var_mean.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor)
|
||||
device_check: NoCheck # TensorIterator
|
||||
variants: function
|
||||
cpp_no_default_args: ["unbiased"]
|
||||
tags: reduction
|
||||
|
||||
- func: var_mean.correction_names(Tensor self, Dimname[1] dim, *, Scalar? correction=None, bool keepdim=False) -> (Tensor, Tensor)
|
||||
device_check: NoCheck # TensorIterator
|
||||
variants: function
|
||||
tags: reduction
|
||||
|
||||
- func: view_as(Tensor(a) self, Tensor other) -> Tensor(a)
|
||||
variants: method
|
||||
@ -6847,6 +6914,7 @@
|
||||
dispatch:
|
||||
CompositeExplicitAutograd: norm
|
||||
autogen: norm.ScalarOpt_dtype_out
|
||||
tags: reduction
|
||||
|
||||
- func: norm.Scalar(Tensor self, Scalar p=2) -> Tensor
|
||||
device_check: NoCheck # TensorIterator
|
||||
@ -6854,6 +6922,7 @@
|
||||
dispatch:
|
||||
CompositeExplicitAutograd: norm
|
||||
autogen: norm.Scalar_out
|
||||
tags: reduction
|
||||
|
||||
- func: norm.ScalarOpt_dim_dtype(Tensor self, Scalar? p, int[1] dim, bool keepdim, *, ScalarType dtype) -> Tensor
|
||||
structured_delegate: norm.dtype_out
|
||||
@ -6861,6 +6930,7 @@
|
||||
variants: function, method
|
||||
dispatch:
|
||||
SparseCPU, SparseCUDA, SparseMPS: sparse_dtype_norm
|
||||
tags: reduction
|
||||
|
||||
- func: norm.ScalarOpt_dim(Tensor self, Scalar? p, int[1] dim, bool keepdim=False) -> Tensor
|
||||
structured_delegate: norm.out
|
||||
@ -6868,6 +6938,7 @@
|
||||
variants: function, method
|
||||
dispatch:
|
||||
SparseCPU, SparseCUDA, SparseMPS: sparse_norm
|
||||
tags: reduction
|
||||
|
||||
- func: norm.dtype_out(Tensor self, Scalar? p, int[1] dim, bool keepdim, *, ScalarType dtype, Tensor(a!) out) -> Tensor(a!)
|
||||
structured: True
|
||||
@ -6875,6 +6946,7 @@
|
||||
dispatch:
|
||||
CPU, CUDA: norm_dtype_out
|
||||
MPS: norm_dtype_out_mps
|
||||
tags: reduction
|
||||
|
||||
- func: norm.out(Tensor self, Scalar? p, int[1] dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
|
||||
structured: True
|
||||
@ -6882,21 +6954,26 @@
|
||||
dispatch:
|
||||
CPU, CUDA: norm_out
|
||||
MPS: norm_out_mps
|
||||
tags: reduction
|
||||
|
||||
# These four redispatch in their implementation, so OK to be CompositeImplicitAutograd
|
||||
- func: norm.names_ScalarOpt_dim_dtype(Tensor self, Scalar? p, Dimname[1] dim, bool keepdim, *, ScalarType dtype) -> Tensor
|
||||
device_check: NoCheck # TensorIterator
|
||||
variants: function, method
|
||||
tags: reduction
|
||||
|
||||
- func: norm.names_ScalarOpt_dim(Tensor self, Scalar? p, Dimname[1] dim, bool keepdim=False) -> Tensor
|
||||
device_check: NoCheck # TensorIterator
|
||||
variants: function, method
|
||||
tags: reduction
|
||||
|
||||
- func: norm.names_dtype_out(Tensor self, Scalar? p, Dimname[1] dim, bool keepdim, *, ScalarType dtype, Tensor(a!) out) -> Tensor(a!)
|
||||
device_check: NoCheck # TensorIterator
|
||||
tags: reduction
|
||||
|
||||
- func: norm.names_out(Tensor self, Scalar? p, Dimname[1] dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!)
|
||||
device_check: NoCheck # TensorIterator
|
||||
tags: reduction
|
||||
|
||||
- func: frexp.Tensor(Tensor self) -> (Tensor mantissa, Tensor exponent)
|
||||
variants: method, function
|
||||
@ -10082,12 +10159,14 @@
|
||||
CPU, CUDA: min
|
||||
MPS: min_mps
|
||||
QuantizedCPU: min_quantized_cpu
|
||||
tags: [reduction]
|
||||
|
||||
- func: min.unary_out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
|
||||
device_check: NoCheck # TensorIterator
|
||||
dispatch:
|
||||
CPU, CUDA: min_unary_out
|
||||
QuantizedCPU: min_quantized_unary_out
|
||||
tags: [reduction]
|
||||
|
||||
- func: fmin(Tensor self, Tensor other) -> Tensor
|
||||
structured_delegate: fmin.out
|
||||
@ -10110,6 +10189,7 @@
|
||||
CPU, CUDA: max
|
||||
MPS: max_mps
|
||||
QuantizedCPU: max_quantized_cpu
|
||||
tags: [reduction]
|
||||
|
||||
- func: fmax(Tensor self, Tensor other) -> Tensor
|
||||
structured_delegate: fmax.out
|
||||
@ -10156,6 +10236,7 @@
|
||||
dispatch:
|
||||
CPU, CUDA: max_unary_out
|
||||
QuantizedCPU: max_quantized_unary_out
|
||||
tags: [reduction]
|
||||
|
||||
- func: minimum(Tensor self, Tensor other) -> Tensor
|
||||
structured_delegate: minimum.out
|
||||
@ -10275,6 +10356,7 @@
|
||||
device_check: NoCheck # TensorIterator
|
||||
structured_delegate: all.all_out
|
||||
variants: method, function
|
||||
tags: reduction
|
||||
|
||||
- func: all.all_out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
|
||||
device_check: NoCheck
|
||||
@ -10283,6 +10365,7 @@
|
||||
CPU, CUDA: all_all_out
|
||||
MTIA: all_all_out_mtia
|
||||
MPS: all_all_out_mps
|
||||
tags: reduction
|
||||
|
||||
- func: any(Tensor self) -> Tensor
|
||||
device_check: NoCheck # TensorIterator
|
||||
@ -10290,7 +10373,7 @@
|
||||
variants: method, function
|
||||
dispatch:
|
||||
SparseCPU, SparseCUDA, SparseMPS: any_sparse
|
||||
tags: core
|
||||
tags: [core, reduction]
|
||||
|
||||
- func: any.all_out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
|
||||
device_check: NoCheck
|
||||
@ -10298,6 +10381,7 @@
|
||||
dispatch:
|
||||
CPU, CUDA: any_all_out
|
||||
MPS: any_all_out_mps
|
||||
tags: reduction
|
||||
|
||||
- func: renorm.out(Tensor self, Scalar p, int dim, Scalar maxnorm, *, Tensor(a!) out) -> Tensor(a!)
|
||||
device_check: NoCheck # TensorIterator
|
||||
@ -14073,16 +14157,10 @@
|
||||
- func: linalg_lu_factor(Tensor A, *, bool pivot=True) -> (Tensor LU, Tensor pivots)
|
||||
python_module: linalg
|
||||
variants: function
|
||||
dispatch:
|
||||
CompositeImplicitAutograd: linalg_lu_factor
|
||||
MPS: linalg_lu_factor_mps
|
||||
|
||||
- func: linalg_lu_factor.out(Tensor A, *, bool pivot=True, Tensor(a!) LU, Tensor(b!) pivots) -> (Tensor(a!) LU, Tensor(b!) pivots)
|
||||
python_module: linalg
|
||||
variants: function
|
||||
dispatch:
|
||||
CompositeImplicitAutograd: linalg_lu_factor_out
|
||||
MPS: linalg_lu_factor_out_mps
|
||||
|
||||
- func: linalg_lu_factor_ex(Tensor A, *, bool pivot=True, bool check_errors=False) -> (Tensor LU, Tensor pivots, Tensor info)
|
||||
python_module: linalg
|
||||
@ -14349,6 +14427,7 @@
|
||||
python_module: linalg
|
||||
variants: function
|
||||
structured_delegate: linalg_vector_norm.out
|
||||
tags: reduction
|
||||
|
||||
- func: linalg_vector_norm.out(Tensor self, Scalar ord=2, int[1]? dim=None, bool keepdim=False, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)
|
||||
python_module: linalg
|
||||
@ -14356,6 +14435,7 @@
|
||||
dispatch:
|
||||
CPU, CUDA: linalg_vector_norm_out
|
||||
MPS: linalg_vector_norm_out_mps
|
||||
tags: reduction
|
||||
|
||||
- func: linalg_matrix_norm(Tensor self, Scalar ord, int[] dim=[-2,-1], bool keepdim=False, *, ScalarType? dtype=None) -> Tensor
|
||||
python_module: linalg
|
||||
|
||||
@ -40,15 +40,7 @@
|
||||
#include <thrust/iterator/discard_iterator.h>
|
||||
|
||||
|
||||
#if defined(__CUDACC__) && (defined(CUSPARSE_VERSION) || (defined(USE_ROCM) && ROCM_VERSION >= 60300))
|
||||
#define IS_CUSPARSE11_AVAILABLE() 1
|
||||
#else
|
||||
#define IS_CUSPARSE11_AVAILABLE() 0
|
||||
#endif
|
||||
|
||||
#if IS_CUSPARSE11_AVAILABLE()
|
||||
#include <library_types.h>
|
||||
#endif
|
||||
|
||||
namespace at::native {
|
||||
|
||||
@ -103,17 +95,9 @@ struct csrMatrixRef {
|
||||
int nnz_{0};
|
||||
std::vector<int> size_{};
|
||||
|
||||
#if IS_CUSPARSE11_AVAILABLE()
|
||||
cusparseSpMatDescr_t description_{0};
|
||||
#else
|
||||
cusparseMatDescr_t description_{0};
|
||||
#endif
|
||||
cusparseSpMatDescr_t description_{0};
|
||||
|
||||
csrMatrixRef() {
|
||||
#if !IS_CUSPARSE11_AVAILABLE()
|
||||
create_general_description_(description_);
|
||||
#endif
|
||||
}
|
||||
csrMatrixRef() = default;
|
||||
|
||||
csrMatrixRef(
|
||||
int* csr_indices,
|
||||
@ -126,7 +110,6 @@ struct csrMatrixRef {
|
||||
csr_values_{csr_values},
|
||||
nnz_{nnz},
|
||||
size_{size} {
|
||||
#if IS_CUSPARSE11_AVAILABLE()
|
||||
cudaDataType cuda_data_type = at::cuda::getCudaDataType<scalar_t>();
|
||||
TORCH_CUDASPARSE_CHECK(cusparseCreateCsr(
|
||||
&description_,
|
||||
@ -140,17 +123,10 @@ struct csrMatrixRef {
|
||||
CUSPARSE_INDEX_32I,
|
||||
CUSPARSE_INDEX_BASE_ZERO,
|
||||
cuda_data_type));
|
||||
#else
|
||||
create_general_description_(description_);
|
||||
#endif
|
||||
}
|
||||
|
||||
~csrMatrixRef() {
|
||||
#if IS_CUSPARSE11_AVAILABLE()
|
||||
cusparseDestroySpMat(description_);
|
||||
#else
|
||||
cusparseDestroyMatDescr(description_);
|
||||
#endif
|
||||
cusparseDestroySpMat(description_);
|
||||
}
|
||||
|
||||
int size(int index) const {
|
||||
@ -196,8 +172,6 @@ struct csrOutput {
|
||||
}
|
||||
};
|
||||
|
||||
#if IS_CUSPARSE11_AVAILABLE()
|
||||
|
||||
// RAII guard helps to support cuSparse 11 API for `A @ B` operation
|
||||
// This generic template exists because with cuSparse the `scalar_t` type could be a double or float
|
||||
template <class scalar_t>
|
||||
@ -396,284 +370,6 @@ template struct CusparseMatrixMultiplyOp<float>;
|
||||
|
||||
template struct CusparseMatrixMultiplyOp<double>;
|
||||
|
||||
#else // if not IS_CUSPARSE11_AVAILABLE()
|
||||
|
||||
using DcsrMatrixRef = csrMatrixRef<double>;
|
||||
using ScsrMatrixRef = csrMatrixRef<float>;
|
||||
|
||||
// RAII guard helps to support cuSparse 10 API for `A @ B` operation
|
||||
// This generic template exists because with cuSparse the `scalar_t` type could be a double or float
|
||||
template <class scalar_t>
|
||||
struct CusparseMatrixMultiplyOp {
|
||||
csrOutput operator()(
|
||||
const csrMatrixRef<scalar_t>& lhs,
|
||||
const csrMatrixRef<scalar_t>& rhs,
|
||||
Tensor &output_values,
|
||||
Tensor &output_indices)
|
||||
{
|
||||
static_assert(false&&sizeof(scalar_t), "cusparse csr sparse-sparse MM only supports data type of float and double.");
|
||||
}
|
||||
};
|
||||
|
||||
// Specializacion for `A @ B` operation for double values with cuSparse
|
||||
template<> struct CusparseMatrixMultiplyOp<double> {
|
||||
csrgemm2Info_t gemm2Info_;
|
||||
|
||||
CusparseMatrixMultiplyOp() {
|
||||
TORCH_CUDASPARSE_CHECK(cusparseCreateCsrgemm2Info(&gemm2Info_));
|
||||
}
|
||||
~CusparseMatrixMultiplyOp() {
|
||||
cusparseDestroyCsrgemm2Info(gemm2Info_);
|
||||
}
|
||||
|
||||
csrOutput operator ()(
|
||||
const DcsrMatrixRef& lhs,
|
||||
const DcsrMatrixRef& rhs,
|
||||
Tensor &output_values,
|
||||
Tensor &output_indices) {
|
||||
double alpha = 1.0;
|
||||
DcsrMatrixRef empty;
|
||||
return Dgemm2(lhs, rhs, empty, &alpha, nullptr, output_values, output_indices);
|
||||
}
|
||||
|
||||
csrOutput Dgemm2(
|
||||
const DcsrMatrixRef& A,
|
||||
const DcsrMatrixRef& B,
|
||||
const DcsrMatrixRef& C,
|
||||
const double* alpha,
|
||||
const double* beta,
|
||||
Tensor &output_values,
|
||||
Tensor &output_indices) {
|
||||
void* buffer_{nullptr};
|
||||
cusparseHandle_t cusparseHandle_ = at::cuda::getCurrentCUDASparseHandle();
|
||||
TORCH_CUDASPARSE_CHECK(cusparseSetPointerMode(cusparseHandle_, CUSPARSE_POINTER_MODE_HOST));
|
||||
|
||||
csrOutput out({A.size(0), B.size(1)});
|
||||
int innerSize = confirm_mult_size(A.size_, B.size_);
|
||||
out.csr_pointers_ = at::empty({out.size(0) + 1}, output_indices.options().dtype(kInt));
|
||||
|
||||
// Compute needed buffer size
|
||||
size_t new_bubber_sz;
|
||||
TORCH_CUDASPARSE_CHECK(cusparseDcsrgemm2_bufferSizeExt(
|
||||
cusparseHandle_,
|
||||
out.size(0),
|
||||
out.size(1),
|
||||
innerSize,
|
||||
alpha,
|
||||
A.description_,
|
||||
A.nnz_,
|
||||
A.csr_pointers_,
|
||||
A.csr_indices_,
|
||||
B.description_,
|
||||
B.nnz_,
|
||||
B.csr_pointers_,
|
||||
B.csr_indices_,
|
||||
beta,
|
||||
C.description_,
|
||||
C.nnz_,
|
||||
C.csr_pointers_,
|
||||
C.csr_indices_,
|
||||
gemm2Info_,
|
||||
&new_bubber_sz));
|
||||
|
||||
// (Re)allocate buffer if needed
|
||||
auto& allocator = *::c10::cuda::CUDACachingAllocator::get();
|
||||
at::DataPtr data_ptr = allocator.allocate(new_bubber_sz);
|
||||
buffer_ = data_ptr.get();
|
||||
|
||||
// Find the resulting non-zero pattern.
|
||||
TORCH_CUDASPARSE_CHECK(cusparseXcsrgemm2Nnz(
|
||||
cusparseHandle_,
|
||||
out.size(0),
|
||||
out.size(1),
|
||||
innerSize,
|
||||
A.description_,
|
||||
A.nnz_,
|
||||
A.csr_pointers_,
|
||||
A.csr_indices_,
|
||||
B.description_,
|
||||
B.nnz_,
|
||||
B.csr_pointers_,
|
||||
B.csr_indices_,
|
||||
C.description_,
|
||||
C.nnz_,
|
||||
C.csr_pointers_,
|
||||
C.csr_indices_,
|
||||
out.description_,
|
||||
out.csr_pointers_.data_ptr<int>(),
|
||||
&out.nnz_,
|
||||
gemm2Info_,
|
||||
buffer_));
|
||||
|
||||
out.csr_indices_ = at::empty({out.nnz_}, output_indices.options().dtype(kInt));
|
||||
out.csr_values_ = at::empty({out.nnz_}, output_values.options());
|
||||
|
||||
// Perform the gemm2 operation for doubles
|
||||
// out = alpha ∗ A ∗ B + beta ∗ C
|
||||
TORCH_CUDASPARSE_CHECK(cusparseDcsrgemm2(
|
||||
cusparseHandle_,
|
||||
out.size(0),
|
||||
out.size(1),
|
||||
innerSize,
|
||||
alpha,
|
||||
A.description_,
|
||||
A.nnz_,
|
||||
A.csr_values_,
|
||||
A.csr_pointers_,
|
||||
A.csr_indices_,
|
||||
B.description_,
|
||||
B.nnz_,
|
||||
B.csr_values_,
|
||||
B.csr_pointers_,
|
||||
B.csr_indices_,
|
||||
beta,
|
||||
C.description_,
|
||||
C.nnz_,
|
||||
C.csr_values_,
|
||||
C.csr_pointers_,
|
||||
C.csr_indices_,
|
||||
out.description_,
|
||||
out.csr_values_.data_ptr<double>(),
|
||||
out.csr_pointers_.data_ptr<int>(),
|
||||
out.csr_indices_.data_ptr<int>(),
|
||||
gemm2Info_,
|
||||
buffer_));
|
||||
return out;
|
||||
}
|
||||
};
|
||||
|
||||
// Specializacion for `A @ B` operation for float values with cuSparse
|
||||
template<> struct CusparseMatrixMultiplyOp<float> {
|
||||
csrgemm2Info_t gemm2Info_;
|
||||
|
||||
CusparseMatrixMultiplyOp() {
|
||||
TORCH_CUDASPARSE_CHECK(cusparseCreateCsrgemm2Info(&gemm2Info_));
|
||||
|
||||
}
|
||||
~CusparseMatrixMultiplyOp() {
|
||||
cusparseDestroyCsrgemm2Info(gemm2Info_);
|
||||
}
|
||||
csrOutput operator()(
|
||||
const ScsrMatrixRef& lhs,
|
||||
const ScsrMatrixRef& rhs,
|
||||
Tensor &output_values,
|
||||
Tensor &output_indices) {
|
||||
float alpha = 1.0;
|
||||
ScsrMatrixRef empty;
|
||||
return Sgemm2(lhs, rhs, empty, &alpha, nullptr, output_values, output_indices);
|
||||
}
|
||||
|
||||
csrOutput Sgemm2(
|
||||
const ScsrMatrixRef& A,
|
||||
const ScsrMatrixRef& B,
|
||||
const ScsrMatrixRef& C,
|
||||
const float* alpha,
|
||||
const float* beta,
|
||||
Tensor &output_values,
|
||||
Tensor &output_indices) {
|
||||
void* buffer_{nullptr};
|
||||
cusparseHandle_t cusparseHandle_ = at::cuda::getCurrentCUDASparseHandle();
|
||||
TORCH_CUDASPARSE_CHECK(cusparseSetPointerMode(cusparseHandle_, CUSPARSE_POINTER_MODE_HOST));
|
||||
|
||||
csrOutput out({A.size(0), B.size(1)});
|
||||
|
||||
int innerSize = confirm_mult_size(A.size_, B.size_);
|
||||
|
||||
out.csr_pointers_ = at::empty({out.size(0) + 1}, output_indices.options().dtype(kInt));
|
||||
|
||||
// Compute needed buffer size
|
||||
size_t new_bubber_sz;
|
||||
TORCH_CUDASPARSE_CHECK(cusparseScsrgemm2_bufferSizeExt(
|
||||
cusparseHandle_,
|
||||
out.size(0),
|
||||
out.size(1),
|
||||
innerSize,
|
||||
alpha,
|
||||
A.description_,
|
||||
A.nnz_,
|
||||
A.csr_pointers_,
|
||||
A.csr_indices_,
|
||||
B.description_,
|
||||
B.nnz_,
|
||||
B.csr_pointers_,
|
||||
B.csr_indices_,
|
||||
beta,
|
||||
C.description_,
|
||||
C.nnz_,
|
||||
C.csr_pointers_,
|
||||
C.csr_indices_,
|
||||
gemm2Info_,
|
||||
&new_bubber_sz));
|
||||
|
||||
auto& allocator = *::c10::cuda::CUDACachingAllocator::get();
|
||||
at::DataPtr data_ptr = allocator.allocate(new_bubber_sz);
|
||||
buffer_ = data_ptr.get();
|
||||
|
||||
// Find the resulting non-zero pattern.
|
||||
TORCH_CUDASPARSE_CHECK(cusparseXcsrgemm2Nnz(
|
||||
cusparseHandle_,
|
||||
out.size(0),
|
||||
out.size(1),
|
||||
innerSize,
|
||||
A.description_,
|
||||
A.nnz_,
|
||||
A.csr_pointers_,
|
||||
A.csr_indices_,
|
||||
B.description_,
|
||||
B.nnz_,
|
||||
B.csr_pointers_,
|
||||
B.csr_indices_,
|
||||
C.description_,
|
||||
C.nnz_,
|
||||
C.csr_pointers_,
|
||||
C.csr_indices_,
|
||||
out.description_,
|
||||
out.csr_pointers_.data_ptr<int>(),
|
||||
&out.nnz_,
|
||||
gemm2Info_,
|
||||
buffer_));
|
||||
|
||||
out.csr_indices_ = at::empty({out.nnz_}, output_indices.options().dtype(kInt));
|
||||
out.csr_values_ = at::empty({out.nnz_}, output_values.options());
|
||||
|
||||
// Perform the gemm2 operation for doubles
|
||||
// out = alpha ∗ A ∗ B + beta ∗ C
|
||||
TORCH_CUDASPARSE_CHECK(cusparseScsrgemm2(
|
||||
cusparseHandle_,
|
||||
out.size(0),
|
||||
out.size(1),
|
||||
innerSize,
|
||||
alpha,
|
||||
A.description_,
|
||||
A.nnz_,
|
||||
A.csr_values_,
|
||||
A.csr_pointers_,
|
||||
A.csr_indices_,
|
||||
B.description_,
|
||||
B.nnz_,
|
||||
B.csr_values_,
|
||||
B.csr_pointers_,
|
||||
B.csr_indices_,
|
||||
beta,
|
||||
C.description_,
|
||||
C.nnz_,
|
||||
C.csr_values_,
|
||||
C.csr_pointers_,
|
||||
C.csr_indices_,
|
||||
out.description_,
|
||||
out.csr_values_.data_ptr<float>(),
|
||||
out.csr_pointers_.data_ptr<int>(),
|
||||
out.csr_indices_.data_ptr<int>(),
|
||||
gemm2Info_,
|
||||
buffer_));
|
||||
return out;
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
|
||||
#endif // IS_CUSPARSE11_AVAILABLE()
|
||||
|
||||
template <typename scalar_t>
|
||||
void sparse_sparse_matmul_cuda_kernel(
|
||||
Tensor& result,
|
||||
@ -815,19 +511,15 @@ Tensor sparse_sparse_matmul_cuda(const Tensor& mat1_, const Tensor& mat2_) {
|
||||
auto output = at::native::empty_like(mat1_);
|
||||
output.sparse_resize_and_clear_({mat1_.size(0), mat2_.size(1)}, mat1_.sparse_dim(), 0);
|
||||
|
||||
#if IS_CUSPARSE11_AVAILABLE() && !defined(USE_ROCM)
|
||||
#if !defined(USE_ROCM)
|
||||
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kHalf, kBFloat16, mat1_.scalar_type(), "sparse_matmul", [&] {
|
||||
sparse_sparse_matmul_cuda_kernel<scalar_t>(output, mat1_.coalesce(), mat2_.coalesce());
|
||||
});
|
||||
#elif IS_CUSPARSE11_AVAILABLE() && defined(USE_ROCM)
|
||||
#else
|
||||
// ROCm does not support half and bfloat16 types for sparse_matmul
|
||||
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(mat1_.scalar_type(), "sparse_matmul", [&] {
|
||||
sparse_sparse_matmul_cuda_kernel<scalar_t>(output, mat1_.coalesce(), mat2_.coalesce());
|
||||
});
|
||||
#else
|
||||
AT_DISPATCH_FLOATING_TYPES(mat1_.scalar_type(), "sparse_matmul", [&] {
|
||||
sparse_sparse_matmul_cuda_kernel<scalar_t>(output, mat1_.coalesce(), mat2_.coalesce());
|
||||
});
|
||||
#endif
|
||||
return output;
|
||||
}
|
||||
|
||||
@ -62,7 +62,6 @@ kernel void build_row_ptr_from_sorted_rows_by_batch(
|
||||
|
||||
template <typename T>
|
||||
kernel void spmm_bmm_coo_rows_grouped(
|
||||
device const long* rows [[buffer(0)]],
|
||||
device const long* cols [[buffer(1)]],
|
||||
device const T* vals [[buffer(2)]],
|
||||
device const T* dense [[buffer(3)]],
|
||||
@ -73,7 +72,6 @@ kernel void spmm_bmm_coo_rows_grouped(
|
||||
uint3 ltid [[thread_position_in_threadgroup]],
|
||||
uint3 tptg [[threads_per_threadgroup]])
|
||||
{
|
||||
const uint B = dims.x;
|
||||
const uint I = dims.y;
|
||||
const uint J = dims.z;
|
||||
const uint K = dims.w;
|
||||
@ -321,7 +319,6 @@ INSTANTIATE_FOR_FLOAT_TYPES(INSTANTIATE_FUSED_GATHER_MUL);
|
||||
#define INSTANTIATE_SPMM_BMM_COO_ROWS_GROUPED(DTYPE) \
|
||||
template [[host_name("spmm_bmm_coo_rows_grouped_" #DTYPE)]] kernel void \
|
||||
spmm_bmm_coo_rows_grouped<DTYPE>( \
|
||||
device const long* rows [[buffer(0)]], \
|
||||
device const long* cols [[buffer(1)]], \
|
||||
device const DTYPE* vals [[buffer(2)]], \
|
||||
device const DTYPE* dense [[buffer(3)]], \
|
||||
|
||||
@ -93,3 +93,7 @@
|
||||
This operator does not support cudagraphs. The presence of this tag on an operator will cause
|
||||
Inductor to split the graph around this operator. Note that operators without this tag may still
|
||||
not support CUDAGraphs. Inductor may have other hardcoded lists around that.
|
||||
- tag: reduction
|
||||
desc: |
|
||||
This tag indicates that an operator performs a reduction operation, computing aggregate values
|
||||
(sum, mean, max, min, etc.) across one or more dimensions of the input tensor(s).
|
||||
|
||||
@ -202,7 +202,6 @@ supported:
|
||||
- select_backward
|
||||
- _trilinear
|
||||
- linalg_pinv.atol_rtol_tensor
|
||||
- svd
|
||||
- logsumexp.out
|
||||
symint:
|
||||
- empty.memory_format
|
||||
|
||||
@ -1,8 +1,8 @@
|
||||
add_loop_eager,compile_time_instruction_count,3070000000,0.1
|
||||
add_loop_eager,compile_time_instruction_count,3184000000,0.1
|
||||
|
||||
|
||||
|
||||
add_loop_eager_dynamic,compile_time_instruction_count,4432000000,0.1
|
||||
add_loop_eager_dynamic,compile_time_instruction_count,4595000000,0.1
|
||||
|
||||
|
||||
|
||||
@ -18,7 +18,7 @@ add_loop_inductor_gpu,compile_time_instruction_count,26800000000,0.1
|
||||
|
||||
|
||||
|
||||
basic_modules_ListOfLinears_eager,compile_time_instruction_count,1048000000,0.1
|
||||
basic_modules_ListOfLinears_eager,compile_time_instruction_count,1096000000,0.1
|
||||
|
||||
|
||||
|
||||
@ -26,7 +26,7 @@ basic_modules_ListOfLinears_inductor,compile_time_instruction_count,15240000000,
|
||||
|
||||
|
||||
|
||||
basic_modules_ListOfLinears_inductor_gpu_force_shape_pad,compile_time_instruction_count,17020000000,0.1
|
||||
basic_modules_ListOfLinears_inductor_gpu_force_shape_pad,compile_time_instruction_count,17720000000,0.1
|
||||
|
||||
|
||||
|
||||
@ -34,11 +34,11 @@ basic_modules_ListOfLinears_inductor_gpu,compile_time_instruction_count,11090000
|
||||
|
||||
|
||||
|
||||
update_hint_regression,compile_time_instruction_count,1719000000,0.1
|
||||
update_hint_regression,compile_time_instruction_count,1645000000,0.1
|
||||
|
||||
|
||||
|
||||
sum_floordiv_regression,compile_time_instruction_count,3686995725,0.1
|
||||
sum_floordiv_regression,compile_time_instruction_count,3813000000,0.1
|
||||
|
||||
|
||||
|
||||
@ -50,31 +50,31 @@ symint_sum_loop,compile_time_instruction_count,4299000000,0.1
|
||||
|
||||
|
||||
|
||||
aotdispatcher_inference_nosubclass_cpu,compile_time_instruction_count,1869000000,0.1
|
||||
aotdispatcher_inference_nosubclass_cpu,compile_time_instruction_count,1793000000,0.1
|
||||
|
||||
|
||||
|
||||
aotdispatcher_inference_subclass_cpu,compile_time_instruction_count,5281000000,0.1
|
||||
aotdispatcher_inference_subclass_cpu,compile_time_instruction_count,5120000000,0.1
|
||||
|
||||
|
||||
|
||||
aotdispatcher_partitioner_cpu,compile_time_instruction_count,8333000000,0.1
|
||||
aotdispatcher_partitioner_cpu,compile_time_instruction_count,7936000000,0.1
|
||||
|
||||
|
||||
|
||||
aotdispatcher_partitioner_cpu2,compile_time_instruction_count,1909000000,0.1
|
||||
aotdispatcher_partitioner_cpu2,compile_time_instruction_count,1848000000,0.1
|
||||
|
||||
|
||||
|
||||
aotdispatcher_training_nosubclass_cpu,compile_time_instruction_count,3442000000,0.1
|
||||
aotdispatcher_training_nosubclass_cpu,compile_time_instruction_count,3152000000,0.1
|
||||
|
||||
|
||||
|
||||
aotdispatcher_training_subclass_cpu,compile_time_instruction_count,9239000000,0.1
|
||||
aotdispatcher_training_subclass_cpu,compile_time_instruction_count,8301000000,0.1
|
||||
|
||||
|
||||
|
||||
mm_loop_inductor_gpu,compile_time_instruction_count,4820968837,0.1
|
||||
mm_loop_inductor_gpu,compile_time_instruction_count,4958000000,0.1
|
||||
|
||||
|
||||
|
||||
@ -82,8 +82,8 @@ mm_loop_inductor_dynamic_gpu,compile_time_instruction_count,9051000000,0.1
|
||||
|
||||
|
||||
|
||||
basic_NestedModule_eager,compile_time_instruction_count,9554000000,0.1
|
||||
basic_NestedModule_eager,compile_time_instruction_count,9990000000,0.1
|
||||
|
||||
|
||||
|
||||
basic_InlineMod_eager,compile_time_instruction_count,7618000000,0.1
|
||||
basic_InlineMod_eager,compile_time_instruction_count,8126000000,0.1
|
||||
|
||||
|
@ -43,6 +43,7 @@ tolerance:
|
||||
- doctr_reco_predictor
|
||||
- drq
|
||||
- phlippe_resnet
|
||||
- pytorch_CycleGAN_and_pix2pix
|
||||
|
||||
higher_bf16:
|
||||
- doctr_reco_predictor
|
||||
|
||||
@ -127,7 +127,7 @@ def trainbench(
|
||||
bwd_time = bwd_start_event.elapsed_time(bwd_end_event)
|
||||
return fwd_time, bwd_time
|
||||
|
||||
creator_args = creator_args = {
|
||||
creator_args = {
|
||||
"seqLength": seqLength,
|
||||
"numLayers": numLayers,
|
||||
"inputSize": inputSize,
|
||||
|
||||
@ -12,7 +12,7 @@ def modeldef(request, net_name, executor, fuser):
|
||||
|
||||
# Given a 'net_name' provided by generate_tests, build the thing
|
||||
name, rnn_creator, context = get_nn_runners(net_name)[0]
|
||||
creator_args = creator_args = {
|
||||
creator_args = {
|
||||
"seqLength": 100,
|
||||
"numLayers": 1,
|
||||
"inputSize": 512,
|
||||
|
||||
@ -48,17 +48,89 @@ PyTorch,sub,"sub_in_one[64,1,64]_in_two[1,64,1]_cpu_dtypetorch.float32",short,Fa
|
||||
PyTorch,div,"div_in_one[64,1,64]_in_two[1,64,1]_cpu_dtypetorch.float32",short,False,58.529255,0.000000
|
||||
PyTorch,mul,"mul_in_one[64,1,64]_in_two[1,64,1]_cpu_dtypetorch.float32",short,False,54.645077,0.000000
|
||||
PyTorch,add,add_M1_N1_K1_cpu_dtype_onetorch.int32_dtype_twotorch.int32,short,False,4.397014,0.000000
|
||||
PyTorch,add,add_M1_N1_K1_cpu_dtype_onetorch.int32_dtype_twotorch.uint8,short,False,7.739000,0.000000
|
||||
PyTorch,add,add_M1_N1_K1_cpu_dtype_onetorch.uint8_dtype_twotorch.int32,short,False,7.786000,0.000000
|
||||
PyTorch,add,add_M1_N1_K1_cpu_dtype_onetorch.uint8_dtype_twotorch.uint8,short,False,1.911000,0.000000
|
||||
PyTorch,add,add_M64_N64_K64_cpu_dtype_onetorch.int32_dtype_twotorch.int32,short,False,59.243500,0.000000
|
||||
PyTorch,add,add_M64_N64_K64_cpu_dtype_onetorch.int32_dtype_twotorch.uint8,short,False,105.066000,0.000000
|
||||
PyTorch,add,add_M64_N64_K64_cpu_dtype_onetorch.uint8_dtype_twotorch.int32,short,False,106.076000,0.000000
|
||||
PyTorch,add,add_M64_N64_K64_cpu_dtype_onetorch.uint8_dtype_twotorch.uint8,short,False,47.225000,0.000000
|
||||
PyTorch,add,add_M64_N64_K128_cpu_dtype_onetorch.int32_dtype_twotorch.int32,short,False,57.947691,0.000000
|
||||
PyTorch,add,add_M64_N64_K128_cpu_dtype_onetorch.int32_dtype_twotorch.uint8,short,False,107.291000,0.000000
|
||||
PyTorch,add,add_M64_N64_K128_cpu_dtype_onetorch.uint8_dtype_twotorch.int32,short,False,107.224000,0.000000
|
||||
PyTorch,add,add_M64_N64_K128_cpu_dtype_onetorch.uint8_dtype_twotorch.uint8,short,False,47.912000,0.000000
|
||||
PyTorch,sub,sub_M1_N1_K1_cpu_dtype_onetorch.int32_dtype_twotorch.int32,short,False,1.925851,0.000000
|
||||
PyTorch,sub,sub_M1_N1_K1_cpu_dtype_onetorch.int32_dtype_twotorch.uint8,short,False,8.0240000,0.000000
|
||||
PyTorch,sub,sub_M1_N1_K1_cpu_dtype_onetorch.uint8_dtype_twotorch.int32,short,False,8.069000,0.000000
|
||||
PyTorch,sub,sub_M1_N1_K1_cpu_dtype_onetorch.uint8_dtype_twotorch.uint8,short,False,1.938000,0.000000
|
||||
PyTorch,sub,sub_M64_N64_K64_cpu_dtype_onetorch.int32_dtype_twotorch.int32,short,False,57.308320,0.000000
|
||||
PyTorch,sub,sub_M64_N64_K64_cpu_dtype_onetorch.int32_dtype_twotorch.uint8,short,False,107.091000,0.000000
|
||||
PyTorch,sub,sub_M64_N64_K64_cpu_dtype_onetorch.uint8_dtype_twotorch.int32,short,False,108.710000,0.000000
|
||||
PyTorch,sub,sub_M64_N64_K64_cpu_dtype_onetorch.uint8_dtype_twotorch.uint8,short,False,47.502000,0.000000
|
||||
PyTorch,sub,sub_M64_N64_K128_cpu_dtype_onetorch.int32_dtype_twotorch.int32,short,False,57.787743,0.000000
|
||||
PyTorch,sub,sub_M64_N64_K128_cpu_dtype_onetorch.int32_dtype_twotorch.uint8,short,False,108.863000,0.000000
|
||||
PyTorch,sub,sub_M64_N64_K128_cpu_dtype_onetorch.uint8_dtype_twotorch.int32,short,False,108.939000,0.000000
|
||||
PyTorch,sub,sub_M64_N64_K128_cpu_dtype_onetorch.uint8_dtype_twotorch.uint8,short,False,47.603000,0.000000
|
||||
PyTorch,div,div_M1_N1_K1_cpu_dtype_onetorch.int32_dtype_twotorch.int32,short,False,7.978539,0.000000
|
||||
PyTorch,div,div_M1_N1_K1_cpu_dtype_onetorch.int32_dtype_twotorch.uint8,short,False,8.741000,0.000000
|
||||
PyTorch,div,div_M1_N1_K1_cpu_dtype_onetorch.uint8_dtype_twotorch.int32,short,False,8.757000,0.000000
|
||||
PyTorch,div,div_M1_N1_K1_cpu_dtype_onetorch.uint8_dtype_twotorch.uint8,short,False,8.774000,0.000000
|
||||
PyTorch,div,div_M64_N64_K64_cpu_dtype_onetorch.int32_dtype_twotorch.int32,short,False,159.754860,0.000000
|
||||
PyTorch,div,div_M64_N64_K64_cpu_dtype_onetorch.int32_dtype_twotorch.uint8,short,False,165.552000,0.000000
|
||||
PyTorch,div,div_M64_N64_K64_cpu_dtype_onetorch.uint8_dtype_twotorch.int32,short,False,165.755000,0.000000
|
||||
PyTorch,div,div_M64_N64_K64_cpu_dtype_onetorch.uint8_dtype_twotorch.uint8,short,False,165.714000,0.000000
|
||||
PyTorch,div,div_M64_N64_K128_cpu_dtype_onetorch.int32_dtype_twotorch.int32,short,False,165.360235,0.000000
|
||||
PyTorch,div,div_M64_N64_K128_cpu_dtype_onetorch.int32_dtype_twotorch.uint8,short,False,168.376000,0.000000
|
||||
PyTorch,div,div_M64_N64_K128_cpu_dtype_onetorch.uint8_dtype_twotorch.int32,short,False,169.604000,0.000000
|
||||
PyTorch,div,div_M64_N64_K128_cpu_dtype_onetorch.uint8_dtype_twotorch.uint8,short,False,168.428000,0.000000
|
||||
PyTorch,mul,mul_M1_N1_K1_cpu_dtype_onetorch.int32_dtype_twotorch.int32,short,False,3.928136,0.000000
|
||||
PyTorch,mul,mul_M1_N1_K1_cpu_dtype_onetorch.int32_dtype_twotorch.uint8,short,False,7.402000,0.000000
|
||||
PyTorch,mul,mul_M1_N1_K1_cpu_dtype_onetorch.uint8_dtype_twotorch.int32,short,False,7.567000,0.000000
|
||||
PyTorch,mul,mul_M1_N1_K1_cpu_dtype_onetorch.uint8_dtype_twotorch.uint8,short,False,4.020000,0.000000
|
||||
PyTorch,mul,mul_M64_N64_K64_cpu_dtype_onetorch.int32_dtype_twotorch.int32,short,False,56.413499,0.000000
|
||||
PyTorch,mul,mul_M64_N64_K64_cpu_dtype_onetorch.int32_dtype_twotorch.uint8,short,False,104.638000,0.000000
|
||||
PyTorch,mul,mul_M64_N64_K64_cpu_dtype_onetorch.uint8_dtype_twotorch.int32,short,False,104.335000,0.000000
|
||||
PyTorch,mul,mul_M64_N64_K64_cpu_dtype_onetorch.uint8_dtype_twotorch.uint8,short,False,48.612000,0.000000
|
||||
PyTorch,mul,mul_M64_N64_K128_cpu_dtype_onetorch.int32_dtype_twotorch.int32,short,False,55.925090,0.000000
|
||||
PyTorch,mul,mul_M64_N64_K128_cpu_dtype_onetorch.int32_dtype_twotorch.uint8,short,False,106.110000,0.000000
|
||||
PyTorch,mul,mul_M64_N64_K128_cpu_dtype_onetorch.uint8_dtype_twotorch.int32,short,False,106.389000,0.000000
|
||||
PyTorch,mul,mul_M64_N64_K128_cpu_dtype_onetorch.uint8_dtype_twotorch.uint8,short,False,48.195000,0.000000
|
||||
PyTorch,asr,asr_M1_N1_K1_cpu_dtype_onetorch.int32_dtype_twotorch.int32,short,False,1.989000,0.000000
|
||||
PyTorch,asr,asr_M1_N1_K1_cpu_dtype_onetorch.int32_dtype_twotorch.uint8,short,False,7.999000,0.000000
|
||||
PyTorch,asr,asr_M1_N1_K1_cpu_dtype_onetorch.uint8_dtype_twotorch.int32,short,False,7.939000,0.000000
|
||||
PyTorch,asr,asr_M1_N1_K1_cpu_dtype_onetorch.uint8_dtype_twotorch.uint8,short,False,1.980000,0.000000
|
||||
PyTorch,asr,asr_M64_N64_K64_cpu_dtype_onetorch.int32_dtype_twotorch.int32,short,False,54.408000,0.000000
|
||||
PyTorch,asr,asr_M64_N64_K64_cpu_dtype_onetorch.int32_dtype_twotorch.uint8,short,False,105.647000,0.000000
|
||||
PyTorch,asr,asr_M64_N64_K64_cpu_dtype_onetorch.uint8_dtype_twotorch.int32,short,False,106.476000,0.000000
|
||||
PyTorch,asr,asr_M64_N64_K64_cpu_dtype_onetorch.uint8_dtype_twotorch.uint8,short,False,48.784000,0.000000
|
||||
PyTorch,asr,asr_M64_N64_K128_cpu_dtype_onetorch.int32_dtype_twotorch.int32,short,False,55.583000,0.000000
|
||||
PyTorch,asr,asr_M64_N64_K128_cpu_dtype_onetorch.int32_dtype_twotorch.uint8,short,False,108.083000,0.000000
|
||||
PyTorch,asr,asr_M64_N64_K128_cpu_dtype_onetorch.uint8_dtype_twotorch.int32,short,False,107.663000,0.000000
|
||||
PyTorch,asr,asr_M64_N64_K128_cpu_dtype_onetorch.uint8_dtype_twotorch.uint8,short,False,48.283000,0.000000
|
||||
PyTorch,lsl,lsl_M1_N1_K1_cpu_dtype_onetorch.int32_dtype_twotorch.int32,short,False,1.986000,0.000000
|
||||
PyTorch,lsl,lsl_M1_N1_K1_cpu_dtype_onetorch.int32_dtype_twotorch.uint8,short,False,7.676000,0.000000
|
||||
PyTorch,lsl,lsl_M1_N1_K1_cpu_dtype_onetorch.uint8_dtype_twotorch.int32,short,False,7.618000,0.000000
|
||||
PyTorch,lsl,lsl_M1_N1_K1_cpu_dtype_onetorch.uint8_dtype_twotorch.uint8,short,False,1.982000,0.000000
|
||||
PyTorch,lsl,lsl_M64_N64_K64_cpu_dtype_onetorch.int32_dtype_twotorch.int32,short,False,54.698000,0.000000
|
||||
PyTorch,lsl,lsl_M64_N64_K64_cpu_dtype_onetorch.int32_dtype_twotorch.uint8,short,False,105.899000,0.000000
|
||||
PyTorch,lsl,lsl_M64_N64_K64_cpu_dtype_onetorch.uint8_dtype_twotorch.int32,short,False,106.741000,0.000000
|
||||
PyTorch,lsl,lsl_M64_N64_K64_cpu_dtype_onetorch.uint8_dtype_twotorch.uint8,short,False,51.182000,0.000000
|
||||
PyTorch,lsl,lsl_M64_N64_K128_cpu_dtype_onetorch.int32_dtype_twotorch.int32,short,False,55.290000,0.000000
|
||||
PyTorch,lsl,lsl_M64_N64_K128_cpu_dtype_onetorch.int32_dtype_twotorch.uint8,short,False,107.744000,0.000000
|
||||
PyTorch,lsl,lsl_M64_N64_K128_cpu_dtype_onetorch.uint8_dtype_twotorch.int32,short,False,107.820000,0.000000
|
||||
PyTorch,lsl,lsl_M64_N64_K128_cpu_dtype_onetorch.uint8_dtype_twotorch.uint8,short,False,51.298000,0.000000
|
||||
PyTorch,xor,xor_M1_N1_K1_cpu_dtype_onetorch.int32_dtype_twotorch.int32,short,False,1.988000,0.000000
|
||||
PyTorch,xor,xor_M1_N1_K1_cpu_dtype_onetorch.int32_dtype_twotorch.uint8,short,False,7.689000,0.000000
|
||||
PyTorch,xor,xor_M1_N1_K1_cpu_dtype_onetorch.uint8_dtype_twotorch.int32,short,False,7.695000,0.000000
|
||||
PyTorch,xor,xor_M1_N1_K1_cpu_dtype_onetorch.uint8_dtype_twotorch.uint8,short,False,1.978000,0.000000
|
||||
PyTorch,xor,xor_M64_N64_K64_cpu_dtype_onetorch.int32_dtype_twotorch.int32,short,False,54.934000,0.000000
|
||||
PyTorch,xor,xor_M64_N64_K64_cpu_dtype_onetorch.int32_dtype_twotorch.uint8,short,False,105.217000,0.000000
|
||||
PyTorch,xor,xor_M64_N64_K64_cpu_dtype_onetorch.uint8_dtype_twotorch.int32,short,False,104.215000,0.000000
|
||||
PyTorch,xor,xor_M64_N64_K64_cpu_dtype_onetorch.uint8_dtype_twotorch.uint8,short,False,47.115000,0.000000
|
||||
PyTorch,xor,xor_M64_N64_K128_cpu_dtype_onetorch.int32_dtype_twotorch.int32,short,False,55.974000,0.000000
|
||||
PyTorch,xor,xor_M64_N64_K128_cpu_dtype_onetorch.int32_dtype_twotorch.uint8,short,False,106.828000,0.000000
|
||||
PyTorch,xor,xor_M64_N64_K128_cpu_dtype_onetorch.uint8_dtype_twotorch.int32,short,False,106.879000,0.000000
|
||||
PyTorch,xor,xor_M64_N64_K128_cpu_dtype_onetorch.uint8_dtype_twotorch.uint8,short,False,48.197000,0.000000
|
||||
PyTorch,logical_and,"logical_and_in_one[64,1,64]_in_two[1,64,1]_cpu_dtypetorch.bool",short,False,78.404254,0.000000
|
||||
PyTorch,logical_and,logical_and_M1_N1_K1_cpu_dtype_onetorch.bool_dtype_twotorch.bool,short,False,5.354032,0.000000
|
||||
PyTorch,logical_and,logical_and_M64_N64_K64_cpu_dtype_onetorch.bool_dtype_twotorch.bool,short,False,54.072783,0.000000
|
||||
@ -71,6 +143,9 @@ PyTorch,baddbmm,baddbmm_B2_M1_N8_K2_cpu_dtypetorch.float32,short,False,6.631313,
|
||||
PyTorch,baddbmm,baddbmm_B2_M1_N8_K2_cpu_dtypetorch.bfloat16,short,False,6.476986,0.000000
|
||||
PyTorch,baddbmm,baddbmm_B128_M64_N32_K64_cpu_dtypetorch.float32,short,False,266.065131,0.000000
|
||||
PyTorch,baddbmm,baddbmm_B128_M64_N32_K64_cpu_dtypetorch.bfloat16,short,False,295.503063,0.000000
|
||||
PyTorch,all,all_M1_N1_K1_cpu,short,False,5.773000,0.000000
|
||||
PyTorch,all,all_M64_N64_K64_cpu,short,False,89.427000,0.000000
|
||||
PyTorch,all,all_M64_N64_K128_cpu,short,False,120.119000,0.000000
|
||||
PyTorch,cat,"cat_sizes(1,1,1)_N2_dim0_cpu",short,False,4.301950,0.000000
|
||||
PyTorch,cat,"cat_sizes(512,512,2)_N2_dim1_cpu",short,False,99.093415,0.000000
|
||||
PyTorch,cat,"cat_sizes(128,1024,2)_N2_dim1_cpu",short,False,96.771578,0.000000
|
||||
|
||||
|
@ -580,6 +580,9 @@ class BenchmarkRunner:
|
||||
else "unknown"
|
||||
)
|
||||
|
||||
# Extract operator name from test_name
|
||||
operator_name = test_name.split("_")[0]
|
||||
|
||||
# Create the record
|
||||
@dataclass
|
||||
class BenchmarkInfo:
|
||||
@ -593,6 +596,7 @@ class BenchmarkRunner:
|
||||
name: str
|
||||
type: str
|
||||
origins: list[str]
|
||||
extra_info: dict[str, Any]
|
||||
|
||||
@dataclass
|
||||
class MetricInfo:
|
||||
@ -618,10 +622,14 @@ class BenchmarkRunner:
|
||||
"device": device,
|
||||
"arch": device_arch,
|
||||
"use_compile": use_compile,
|
||||
"operator_name": operator_name,
|
||||
},
|
||||
),
|
||||
model=ModelInfo(
|
||||
name=test_name, type="micro-benchmark", origins=["pytorch"]
|
||||
name=test_name,
|
||||
type="micro-benchmark",
|
||||
origins=["pytorch"],
|
||||
extra_info={"operator_name": operator_name},
|
||||
),
|
||||
metric=MetricInfo(
|
||||
name="latency",
|
||||
|
||||
@ -25,7 +25,7 @@ binary_configs_broadcast = op_bench.config_list(
|
||||
],
|
||||
cross_product_configs={
|
||||
"device": ["cpu"],
|
||||
"dtype": [torch.float],
|
||||
"dtype": [torch.float, torch.bfloat16],
|
||||
},
|
||||
tags=["short"],
|
||||
)
|
||||
@ -71,8 +71,8 @@ binary_short_configs = op_bench.config_list(
|
||||
],
|
||||
cross_product_configs={
|
||||
"device": ["cpu", "cuda"],
|
||||
"dtype_one": [torch.int32],
|
||||
"dtype_two": [torch.int32],
|
||||
"dtype_one": [torch.int32, torch.uint8],
|
||||
"dtype_two": [torch.int32, torch.uint8],
|
||||
},
|
||||
tags=["short"],
|
||||
)
|
||||
@ -82,8 +82,8 @@ binary_long_configs = op_bench.cross_product_configs(
|
||||
N=[32, 64],
|
||||
K=[256, 512],
|
||||
device=["cpu", "cuda"],
|
||||
dtype_one=[torch.int8, torch.int32],
|
||||
dtype_two=[torch.int8, torch.int32],
|
||||
dtype_one=[torch.int8, torch.int32, torch.uint8],
|
||||
dtype_two=[torch.int8, torch.int32, torch.uint8],
|
||||
tags=["long"],
|
||||
)
|
||||
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@ -176,8 +176,8 @@ THIRD_PARTY_LIBS = {
|
||||
"omp": ["//xplat/third-party/linker_lib:omp", "//third_party:no-op"],
|
||||
"pocketfft": ["//third-party/pocket_fft:pocketfft", "//third_party:pocketfft_header"],
|
||||
"psimd": ["//xplat/third-party/psimd:psimd", "//third_party:psimd"],
|
||||
"pthreadpool": ["//xplat/third-party/pthreadpool:pthreadpool", "//third_party:pthreadpool"],
|
||||
"pthreadpool_header": ["//xplat/third-party/pthreadpool:pthreadpool_header", "//third_party:pthreadpool_header"],
|
||||
"pthreadpool": ["fbsource//xplat/third-party/pthreadpool:pthreadpool", "//third_party:pthreadpool"],
|
||||
"pthreadpool_header": ["fbsource//xplat/third-party/pthreadpool:pthreadpool_header", "//third_party:pthreadpool_header"],
|
||||
"moodycamel": ["//third-party/moodycamel:moodycamel", "//third_party:moodycamel"],
|
||||
"pyyaml": ["//third-party/pypi/pyyaml:pyyaml", "//third_party:pyyaml"],
|
||||
"rt": ["//xplat/third-party/linker_lib:rt", "//third_party:rt"],
|
||||
@ -1729,8 +1729,10 @@ def define_buck_targets(
|
||||
"torch/csrc/jit/backends/backend_debug_info.cpp",
|
||||
"torch/csrc/jit/backends/backend_interface.cpp",
|
||||
],
|
||||
compiler_flags = get_pt_compiler_flags(),
|
||||
fbandroid_compiler_flags = c2_fbandroid_xplat_compiler_flags,
|
||||
compiler_flags = get_pt_compiler_flags() + select({
|
||||
"DEFAULT": [],
|
||||
"ovr_config//os:android": c2_fbandroid_xplat_compiler_flags
|
||||
}),
|
||||
# @lint-ignore BUCKLINT link_whole
|
||||
link_whole = True,
|
||||
linker_flags = get_no_as_needed_linker_flag(),
|
||||
@ -2023,6 +2025,9 @@ def define_buck_targets(
|
||||
"ovr_config//os:android-x86_64": [
|
||||
"-mssse3",
|
||||
],
|
||||
}) + select({
|
||||
"DEFAULT": [],
|
||||
"ovr_config//os:android": c2_fbandroid_xplat_compiler_flags,
|
||||
}),
|
||||
exported_preprocessor_flags = get_aten_preprocessor_flags(),
|
||||
exported_deps = [
|
||||
|
||||
@ -855,6 +855,7 @@ libtorch_python_cuda_core_sources = [
|
||||
"torch/csrc/cuda/Stream.cpp",
|
||||
"torch/csrc/cuda/Graph.cpp",
|
||||
"torch/csrc/cuda/MemPool.cpp",
|
||||
"torch/csrc/cuda/GreenContext.cpp",
|
||||
"torch/csrc/cuda/shared/cudart.cpp",
|
||||
"torch/csrc/cuda/shared/nvtx.cpp",
|
||||
"torch/csrc/cuda/utils.cpp",
|
||||
|
||||
@ -9,6 +9,7 @@
|
||||
|
||||
#include <c10/core/Device.h>
|
||||
#include <c10/core/DeviceType.h>
|
||||
#include <c10/core/alignment.h>
|
||||
#include <c10/macros/Export.h>
|
||||
#include <c10/macros/Macros.h>
|
||||
#include <c10/util/Exception.h>
|
||||
|
||||
@ -13,7 +13,17 @@
|
||||
namespace c10::CachingAllocator {
|
||||
|
||||
// "large" allocations may be packed in 20 MiB blocks
|
||||
const size_t kLargeBuffer = 20971520;
|
||||
constexpr size_t kLargeBuffer = 20971520;
|
||||
// "small" allocations are packed in 2 MiB blocks
|
||||
constexpr size_t kSmallBuffer = 2097152;
|
||||
// all sizes are rounded to at least 512 bytes
|
||||
constexpr size_t kMinBlockSize = 512;
|
||||
// largest "small" allocation is 1 MiB
|
||||
constexpr size_t kSmallSize = 1048576;
|
||||
// allocations between 1 and 10 MiB may use kLargeBuffer
|
||||
constexpr size_t kMinLargeAlloc = 10485760;
|
||||
// round up large allocations to 2 MiB
|
||||
constexpr size_t kRoundLarge = 2097152;
|
||||
|
||||
// A utility class for tokenizing allocator configuration strings into discrete
|
||||
// parts. For example, the config string:
|
||||
|
||||
@ -223,7 +223,7 @@ inline DispatchKey backendToDispatchKey(Backend b) {
|
||||
case Backend::PrivateUse1:
|
||||
return DispatchKey::PrivateUse1;
|
||||
default:
|
||||
throw std::runtime_error("Unknown backend");
|
||||
TORCH_CHECK(false, "Unknown backend");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -52,7 +52,9 @@ constexpr DispatchKeySet math_dispatch_keyset = backend_dispatch_keyset |
|
||||
// where we would like to support composite implicit kernels but not
|
||||
// explicit kernels therefore we manually add the key to the
|
||||
// math_dispatch_keyset
|
||||
DispatchKeySet{DispatchKey::NestedTensor};
|
||||
DispatchKeySet{DispatchKey::NestedTensor} |
|
||||
// Functionalize should always reuse CompositeImplicit decomps.
|
||||
DispatchKeySet{DispatchKey::Functionalize};
|
||||
|
||||
constexpr DispatchKeySet nested_dispatch_keyset =
|
||||
DispatchKeySet(
|
||||
|
||||
@ -336,7 +336,7 @@ class C10_API Scalar {
|
||||
} else if (isBoolean()) {
|
||||
return ScalarType::Bool;
|
||||
} else {
|
||||
throw std::runtime_error("Unknown scalar type.");
|
||||
TORCH_CHECK(false, "Unknown scalar type.");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -228,7 +228,7 @@ std::pair<std::string, std::string> getDtypeNames(c10::ScalarType scalarType) {
|
||||
case c10::ScalarType::Float4_e2m1fn_x2:
|
||||
return std::make_pair("float4_e2m1fn_x2", "");
|
||||
default:
|
||||
throw std::runtime_error("Unimplemented scalar type");
|
||||
TORCH_CHECK(false, "Unimplemented scalar type");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -137,22 +137,6 @@ inline ScalarType toQIntType(ScalarType t) {
|
||||
}
|
||||
}
|
||||
|
||||
inline ScalarType toUnderlying(ScalarType t) {
|
||||
switch (t) {
|
||||
case ScalarType::QUInt8:
|
||||
case ScalarType::QUInt4x2:
|
||||
[[fallthrough]];
|
||||
case ScalarType::QUInt2x4:
|
||||
return ScalarType::Byte;
|
||||
case ScalarType::QInt8:
|
||||
return ScalarType::Char;
|
||||
case ScalarType::QInt32:
|
||||
return ScalarType::Int;
|
||||
default:
|
||||
return t;
|
||||
}
|
||||
}
|
||||
|
||||
inline bool isSignedType(ScalarType t) {
|
||||
#define CASE_ISSIGNED(name) \
|
||||
case ScalarType::name: \
|
||||
|
||||
@ -1,6 +1,7 @@
|
||||
#pragma once
|
||||
|
||||
#include <cstddef>
|
||||
#include <new>
|
||||
|
||||
namespace c10 {
|
||||
|
||||
@ -18,4 +19,12 @@ constexpr size_t gPagesize = 4096;
|
||||
// since the default thp pagesize is 2MB, enable thp only
|
||||
// for buffers of size 2MB or larger to avoid memory bloating
|
||||
constexpr size_t gAlloc_threshold_thp = static_cast<size_t>(2) * 1024 * 1024;
|
||||
|
||||
// Cache line size used to avoid false sharing between threads. Falls back to 64
|
||||
// bytes if C++17 feature is unavailable.
|
||||
#ifdef __cpp_lib_hardware_interference_size
|
||||
using std::hardware_destructive_interference_size;
|
||||
#else
|
||||
constexpr std::size_t hardware_destructive_interference_size = 64;
|
||||
#endif
|
||||
} // namespace c10
|
||||
|
||||
@ -87,9 +87,7 @@ bool ThreadPool::inThreadPool() const {
|
||||
}
|
||||
|
||||
void ThreadPool::run(std::function<void()> func) {
|
||||
if (threads_.empty()) {
|
||||
throw std::runtime_error("No threads to run a task");
|
||||
}
|
||||
TORCH_CHECK(threads_.size() > 0, "No threads to run a task");
|
||||
std::unique_lock<std::mutex> lock(mutex_);
|
||||
|
||||
// Set task and signal condition variable so that a worker thread will
|
||||
|
||||
@ -131,15 +131,6 @@ namespace Native {
|
||||
* notifyCaptureDestroy.
|
||||
*/
|
||||
|
||||
constexpr size_t kMinBlockSize =
|
||||
512; // all sizes are rounded to at least 512 bytes
|
||||
constexpr size_t kSmallSize = 1048576; // largest "small" allocation is 1 MiB
|
||||
constexpr size_t kSmallBuffer =
|
||||
2097152; // "small" allocations are packed in 2 MiB blocks
|
||||
constexpr size_t kMinLargeAlloc =
|
||||
10485760; // allocations between 1 and 10 MiB may use kLargeBuffer
|
||||
constexpr size_t kRoundLarge = 2097152; // round up large allocations to 2 MiB
|
||||
|
||||
static char SHAREABLE_HANDLE_VERSION = 2;
|
||||
enum ShareableHandleType : char {
|
||||
SHAREABLE_CUDA_MALLOC = 'c',
|
||||
@ -941,7 +932,7 @@ class EventPool {
|
||||
|
||||
private:
|
||||
struct PerDevicePool {
|
||||
alignas(64) std::mutex mutex_;
|
||||
alignas(hardware_destructive_interference_size) std::mutex mutex_;
|
||||
std::vector<std::unique_ptr<cudaEvent_t>> event_pool_;
|
||||
};
|
||||
std::vector<PerDevicePool> pools_;
|
||||
@ -3758,11 +3749,6 @@ static void uncached_delete(void* ptr) {
|
||||
static void local_raw_delete(void* ptr);
|
||||
thread_local std::stack<std::string> DeviceCachingAllocator::compile_context;
|
||||
thread_local std::string DeviceCachingAllocator::user_metadata;
|
||||
#ifdef __cpp_lib_hardware_interference_size
|
||||
using std::hardware_destructive_interference_size;
|
||||
#else
|
||||
static constexpr std::size_t hardware_destructive_interference_size = 64;
|
||||
#endif
|
||||
|
||||
class NativeCachingAllocator : public CUDAAllocator {
|
||||
private:
|
||||
@ -4483,7 +4469,10 @@ struct BackendStaticInitializer {
|
||||
if (key == "backend") {
|
||||
tokenizer.checkToken(++i, ":");
|
||||
i++; // Move to the value after the colon
|
||||
if (tokenizer[i] == "cudaMallocAsync"
|
||||
// break up token to trick hipify
|
||||
if (tokenizer[i] ==
|
||||
"c"
|
||||
"udaMallocAsync"
|
||||
#ifdef USE_ROCM
|
||||
// convenience for ROCm users to allow either CUDA or HIP env var
|
||||
|| tokenizer[i] == "hipMallocAsync"
|
||||
|
||||
@ -913,7 +913,9 @@ struct CudaMallocAsyncAllocator : public CUDAAllocator {
|
||||
}
|
||||
}
|
||||
std::string name() override {
|
||||
return "cudaMallocAsync";
|
||||
// break up token to trick hipify
|
||||
return "c"
|
||||
"udaMallocAsync";
|
||||
}
|
||||
void copy_data(void* dest, const void* src, std::size_t count) const final {
|
||||
C10_CUDA_CHECK(
|
||||
|
||||
@ -51,6 +51,17 @@
|
||||
|
||||
#if defined(CUDA_VERSION) && (CUDA_VERSION >= 12030)
|
||||
#define C10_LIBCUDA_DRIVER_API_OPTIONAL(_) \
|
||||
_(cuCtxFromGreenCtx, 12080) \
|
||||
_(cuCtxGetCurrent, 12080) \
|
||||
_(cuCtxPopCurrent, 12080) \
|
||||
_(cuCtxPushCurrent, 12080) \
|
||||
_(cuCtxSetCurrent, 12080) \
|
||||
_(cuGreenCtxCreate, 12080) \
|
||||
_(cuGreenCtxDestroy, 12080) \
|
||||
_(cuDevSmResourceSplitByCount, 12080) \
|
||||
_(cuDeviceGet, 12080) \
|
||||
_(cuDeviceGetDevResource, 12080) \
|
||||
_(cuDevResourceGenerateDesc, 12080) \
|
||||
_(cuMulticastAddDevice, 12030) \
|
||||
_(cuMulticastBindMem, 12030) \
|
||||
_(cuMulticastCreate, 12030) \
|
||||
|
||||
@ -18,7 +18,6 @@
|
||||
#include <c10/macros/Macros.h>
|
||||
#include <c10/util/Exception.h>
|
||||
#include <c10/util/SmallVector.h>
|
||||
#include <torch/headeronly/util/HeaderOnlyArrayRef.h>
|
||||
|
||||
#include <array>
|
||||
#include <cstddef>
|
||||
@ -41,106 +40,200 @@ namespace c10 {
|
||||
///
|
||||
/// This is intended to be trivially copyable, so it should be passed by
|
||||
/// value.
|
||||
///
|
||||
/// NOTE: We have refactored out the headeronly parts of the ArrayRef struct
|
||||
/// into HeaderOnlyArrayRef. As adding `virtual` would change the performance of
|
||||
/// the underlying constexpr calls, we rely on apparent-type dispatch for
|
||||
/// inheritance. This should be fine because their memory format is the same,
|
||||
/// and it is never incorrect for ArrayRef to call HeaderOnlyArrayRef methods.
|
||||
/// However, you should prefer to use ArrayRef when possible, because its use
|
||||
/// of TORCH_CHECK will lead to better user-facing error messages.
|
||||
template <typename T>
|
||||
class ArrayRef final : public HeaderOnlyArrayRef<T> {
|
||||
class ArrayRef final {
|
||||
public:
|
||||
/// @name Constructors, all inherited from HeaderOnlyArrayRef except for
|
||||
/// SmallVector.
|
||||
using iterator = const T*;
|
||||
using const_iterator = const T*;
|
||||
using size_type = size_t;
|
||||
using value_type = T;
|
||||
|
||||
using reverse_iterator = std::reverse_iterator<iterator>;
|
||||
|
||||
private:
|
||||
/// The start of the array, in an external buffer.
|
||||
const T* Data;
|
||||
|
||||
/// The number of elements.
|
||||
size_type Length;
|
||||
|
||||
void debugCheckNullptrInvariant() {
|
||||
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
|
||||
Data != nullptr || Length == 0,
|
||||
"created ArrayRef with nullptr and non-zero length! std::optional relies on this being illegal");
|
||||
}
|
||||
|
||||
public:
|
||||
/// @name Constructors
|
||||
/// @{
|
||||
|
||||
using HeaderOnlyArrayRef<T>::HeaderOnlyArrayRef;
|
||||
/// Construct an empty ArrayRef.
|
||||
/* implicit */ constexpr ArrayRef() : Data(nullptr), Length(0) {}
|
||||
|
||||
/// Construct an ArrayRef from a std::vector.
|
||||
/// This constructor is identical to the one in HeaderOnlyArrayRef, but we
|
||||
/// include it to help with Class Template Argument Deduction (CTAD).
|
||||
/// Without it, CTAD can fail sometimes due to the indirect constructor
|
||||
/// inheritance. So we explicitly include this constructor.
|
||||
template <typename A>
|
||||
/* implicit */ ArrayRef(const std::vector<T, A>& Vec)
|
||||
: HeaderOnlyArrayRef<T>(Vec.data(), Vec.size()) {}
|
||||
/// Construct an ArrayRef from a single element.
|
||||
// TODO Make this explicit
|
||||
constexpr ArrayRef(const T& OneElt) : Data(&OneElt), Length(1) {}
|
||||
|
||||
/// Construct an ArrayRef from a pointer and length.
|
||||
constexpr ArrayRef(const T* data, size_t length)
|
||||
: Data(data), Length(length) {
|
||||
debugCheckNullptrInvariant();
|
||||
}
|
||||
|
||||
/// Construct an ArrayRef from a range.
|
||||
constexpr ArrayRef(const T* begin, const T* end)
|
||||
: Data(begin), Length(end - begin) {
|
||||
debugCheckNullptrInvariant();
|
||||
}
|
||||
|
||||
/// Construct an ArrayRef from a SmallVector. This is templated in order to
|
||||
/// avoid instantiating SmallVectorTemplateCommon<T> whenever we
|
||||
/// copy-construct an ArrayRef.
|
||||
/// NOTE: this is the only constructor that is not inherited from
|
||||
/// HeaderOnlyArrayRef.
|
||||
template <typename U>
|
||||
/* implicit */ ArrayRef(const SmallVectorTemplateCommon<T, U>& Vec)
|
||||
: HeaderOnlyArrayRef<T>(Vec.data(), Vec.size()) {}
|
||||
: Data(Vec.data()), Length(Vec.size()) {
|
||||
debugCheckNullptrInvariant();
|
||||
}
|
||||
|
||||
template <
|
||||
typename Container,
|
||||
typename U = decltype(std::declval<Container>().data()),
|
||||
typename = std::enable_if_t<
|
||||
(std::is_same_v<U, T*> || std::is_same_v<U, T const*>)>>
|
||||
/* implicit */ ArrayRef(const Container& container)
|
||||
: Data(container.data()), Length(container.size()) {
|
||||
debugCheckNullptrInvariant();
|
||||
}
|
||||
|
||||
/// Construct an ArrayRef from a std::vector.
|
||||
// The enable_if stuff here makes sure that this isn't used for
|
||||
// std::vector<bool>, because ArrayRef can't work on a std::vector<bool>
|
||||
// bitfield.
|
||||
template <typename A>
|
||||
/* implicit */ ArrayRef(const std::vector<T, A>& Vec)
|
||||
: Data(Vec.data()), Length(Vec.size()) {
|
||||
static_assert(
|
||||
!std::is_same_v<T, bool>,
|
||||
"ArrayRef<bool> cannot be constructed from a std::vector<bool> bitfield.");
|
||||
}
|
||||
|
||||
/// Construct an ArrayRef from a std::array
|
||||
template <size_t N>
|
||||
/* implicit */ constexpr ArrayRef(const std::array<T, N>& Arr)
|
||||
: Data(Arr.data()), Length(N) {}
|
||||
|
||||
/// Construct an ArrayRef from a C array.
|
||||
template <size_t N>
|
||||
// NOLINTNEXTLINE(*c-arrays*)
|
||||
/* implicit */ constexpr ArrayRef(const T (&Arr)[N]) : Data(Arr), Length(N) {}
|
||||
|
||||
/// Construct an ArrayRef from a std::initializer_list.
|
||||
/* implicit */ constexpr ArrayRef(const std::initializer_list<T>& Vec)
|
||||
: Data(
|
||||
std::begin(Vec) == std::end(Vec) ? static_cast<T*>(nullptr)
|
||||
: std::begin(Vec)),
|
||||
Length(Vec.size()) {}
|
||||
|
||||
/// @}
|
||||
/// @name Simple Operations, mostly inherited from HeaderOnlyArrayRef
|
||||
/// @name Simple Operations
|
||||
/// @{
|
||||
|
||||
constexpr iterator begin() const {
|
||||
return Data;
|
||||
}
|
||||
constexpr iterator end() const {
|
||||
return Data + Length;
|
||||
}
|
||||
|
||||
// These are actually the same as iterator, since ArrayRef only
|
||||
// gives you const iterators.
|
||||
constexpr const_iterator cbegin() const {
|
||||
return Data;
|
||||
}
|
||||
constexpr const_iterator cend() const {
|
||||
return Data + Length;
|
||||
}
|
||||
|
||||
constexpr reverse_iterator rbegin() const {
|
||||
return reverse_iterator(end());
|
||||
}
|
||||
constexpr reverse_iterator rend() const {
|
||||
return reverse_iterator(begin());
|
||||
}
|
||||
|
||||
/// Check if all elements in the array satisfy the given expression
|
||||
constexpr bool allMatch(const std::function<bool(const T&)>& pred) const {
|
||||
return std::all_of(cbegin(), cend(), pred);
|
||||
}
|
||||
|
||||
/// empty - Check if the array is empty.
|
||||
constexpr bool empty() const {
|
||||
return Length == 0;
|
||||
}
|
||||
|
||||
constexpr const T* data() const {
|
||||
return Data;
|
||||
}
|
||||
|
||||
/// size - Get the array size.
|
||||
constexpr size_t size() const {
|
||||
return Length;
|
||||
}
|
||||
|
||||
/// front - Get the first element.
|
||||
/// We deviate from HeaderOnlyArrayRef by using TORCH_CHECK instead of
|
||||
/// STD_TORCH_CHECK
|
||||
constexpr const T& front() const {
|
||||
TORCH_CHECK(
|
||||
!this->empty(), "ArrayRef: attempted to access front() of empty list");
|
||||
return this->Data[0];
|
||||
!empty(), "ArrayRef: attempted to access front() of empty list");
|
||||
return Data[0];
|
||||
}
|
||||
|
||||
/// back - Get the last element.
|
||||
/// We deviate from HeaderOnlyArrayRef by using TORCH_CHECK instead of
|
||||
/// STD_TORCH_CHECK
|
||||
constexpr const T& back() const {
|
||||
TORCH_CHECK(
|
||||
!this->empty(), "ArrayRef: attempted to access back() of empty list");
|
||||
return this->Data[this->Length - 1];
|
||||
TORCH_CHECK(!empty(), "ArrayRef: attempted to access back() of empty list");
|
||||
return Data[Length - 1];
|
||||
}
|
||||
|
||||
/// equals - Check for element-wise equality.
|
||||
constexpr bool equals(ArrayRef RHS) const {
|
||||
return Length == RHS.Length && std::equal(begin(), end(), RHS.begin());
|
||||
}
|
||||
|
||||
/// slice(n, m) - Take M elements of the array starting at element N
|
||||
/// We deviate from HeaderOnlyArrayRef by using TORCH_CHECK instead of
|
||||
/// STD_TORCH_CHECK
|
||||
constexpr ArrayRef<T> slice(size_t N, size_t M) const {
|
||||
TORCH_CHECK(
|
||||
N + M <= this->size(),
|
||||
N + M <= size(),
|
||||
"ArrayRef: invalid slice, N = ",
|
||||
N,
|
||||
"; M = ",
|
||||
M,
|
||||
"; size = ",
|
||||
this->size());
|
||||
return ArrayRef<T>(this->data() + N, M);
|
||||
size());
|
||||
return ArrayRef<T>(data() + N, M);
|
||||
}
|
||||
|
||||
/// slice(n) - Chop off the first N elements of the array.
|
||||
/// We deviate from HeaderOnlyArrayRef by using TORCH_CHECK instead of
|
||||
/// STD_TORCH_CHECK
|
||||
constexpr ArrayRef<T> slice(size_t N) const {
|
||||
TORCH_CHECK(
|
||||
N <= this->size(),
|
||||
"ArrayRef: invalid slice, N = ",
|
||||
N,
|
||||
"; size = ",
|
||||
this->size());
|
||||
return slice(N, this->size() - N); // should this slice be this->slice?
|
||||
N <= size(), "ArrayRef: invalid slice, N = ", N, "; size = ", size());
|
||||
return slice(N, size() - N);
|
||||
}
|
||||
|
||||
/// @}
|
||||
/// @name Operator Overloads
|
||||
/// @{
|
||||
constexpr const T& operator[](size_t Index) const {
|
||||
return Data[Index];
|
||||
}
|
||||
|
||||
/// Vector compatibility
|
||||
/// We deviate from HeaderOnlyArrayRef by using TORCH_CHECK instead of
|
||||
/// STD_TORCH_CHECK
|
||||
constexpr const T& at(size_t Index) const {
|
||||
TORCH_CHECK(
|
||||
Index < this->Length,
|
||||
Index < Length,
|
||||
"ArrayRef: invalid index Index = ",
|
||||
Index,
|
||||
"; Length = ",
|
||||
this->Length);
|
||||
return this->Data[Index];
|
||||
Length);
|
||||
return Data[Index];
|
||||
}
|
||||
|
||||
/// Disallow accidental assignment from a temporary.
|
||||
@ -160,6 +253,13 @@ class ArrayRef final : public HeaderOnlyArrayRef<T> {
|
||||
std::enable_if_t<std::is_same_v<U, T>, ArrayRef<T>>& operator=(
|
||||
std::initializer_list<U>) = delete;
|
||||
|
||||
/// @}
|
||||
/// @name Expensive Operations
|
||||
/// @{
|
||||
std::vector<T> vec() const {
|
||||
return std::vector<T>(Data, Data + Length);
|
||||
}
|
||||
|
||||
/// @}
|
||||
};
|
||||
|
||||
|
||||
@ -45,14 +45,7 @@ constexpr bool is_pod_v = is_pod<T>::value;
|
||||
|
||||
namespace guts {
|
||||
|
||||
#if defined(__cpp_lib_apply) && !defined(__CUDA_ARCH__) && !defined(__HIP__)
|
||||
|
||||
template <class F, class Tuple>
|
||||
C10_HOST_DEVICE inline constexpr decltype(auto) apply(F&& f, Tuple&& t) {
|
||||
return std::apply(std::forward<F>(f), std::forward<Tuple>(t));
|
||||
}
|
||||
|
||||
#else
|
||||
#if defined(__HIP__)
|
||||
|
||||
// Implementation from http://en.cppreference.com/w/cpp/utility/apply (but
|
||||
// modified)
|
||||
|
||||
@ -14,16 +14,6 @@ using namespace c10::CachingDeviceAllocator;
|
||||
|
||||
// newly allocated memory with 512-byte alignment.
|
||||
constexpr size_t kDeviceAlignment = 512;
|
||||
// all sizes are rounded to at least 512 bytes
|
||||
constexpr size_t kMinBlockSize = 512;
|
||||
// largest "small" allocation is 1 MiB
|
||||
constexpr size_t kSmallSize = 1048576;
|
||||
// "small" allocations are packed in 2 MiB blocks
|
||||
constexpr size_t kSmallBuffer = 2097152;
|
||||
// allocations between 1 and 10 MiB may use kLargeBuffer
|
||||
constexpr size_t kMinLargeAlloc = 10485760;
|
||||
// round up large allocations to 2 MiB
|
||||
constexpr size_t kRoundLarge = 2097152;
|
||||
|
||||
namespace {
|
||||
using stream_set = ska::flat_hash_set<xpu::XPUStream>;
|
||||
@ -554,7 +544,7 @@ static void local_raw_delete(void* ptr);
|
||||
|
||||
class XPUAllocator : public DeviceAllocator {
|
||||
private:
|
||||
std::mutex mutex;
|
||||
alignas(hardware_destructive_interference_size) std::mutex mutex;
|
||||
ska::flat_hash_map<void*, Block*> allocated_blocks;
|
||||
|
||||
void add_allocated_block(Block* block) {
|
||||
|
||||
@ -607,6 +607,12 @@ if(USE_CUDA)
|
||||
set_source_files_properties(${ASYNC_MM_FILE} PROPERTIES COMPILE_FLAGS "-gencode arch=compute_90a,code=sm_90a")
|
||||
endif()
|
||||
endif()
|
||||
if(NOT WIN32)
|
||||
set_source_files_properties(
|
||||
${TORCH_ROOT}/aten/src/ATen/cuda/CUDAGreenContext.cpp
|
||||
PROPERTIES COMPILE_FLAGS "-DPYTORCH_C10_DRIVER_API_SUPPORTED=1"
|
||||
)
|
||||
endif()
|
||||
set_source_files_properties(
|
||||
${TORCH_ROOT}/aten/src/ATen/cuda/detail/LazyNVRTC.cpp
|
||||
PROPERTIES COMPILE_DEFINITIONS "NVRTC_SHORTHASH=${CUDA_NVRTC_SHORTHASH}"
|
||||
|
||||
@ -16,7 +16,7 @@ find_path(vecLib_INCLUDE_DIR vecLib.h
|
||||
DOC "vecLib include directory"
|
||||
PATHS /System/Library/Frameworks/Accelerate.framework/Versions/Current/${__veclib_include_suffix}
|
||||
/System/Library/${__veclib_include_suffix}
|
||||
/Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX10.9.sdk/System/Library/Frameworks/Accelerate.framework/Versions/Current/Frameworks/vecLib.framework/Headers/
|
||||
/Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk/System/Library/Frameworks/Accelerate.framework/Versions/Current/Frameworks/vecLib.framework/Headers/
|
||||
${CMAKE_OSX_SYSROOT}/System/Library/Frameworks/Accelerate.framework/Versions/Current/${__veclib_include_suffix}
|
||||
NO_DEFAULT_PATH)
|
||||
|
||||
|
||||
@ -224,6 +224,12 @@ AMD/ROCm/HIP
|
||||
- Jithun Nair (`jithunnair-amd <https://github.com/jithunnair-amd>`__)
|
||||
- (emeritus) Junjie Bai (`bddppq <https://github.com/bddppq>`__)
|
||||
|
||||
XPU/Intel GPU
|
||||
~~~~~~~~~~~~~
|
||||
|
||||
- Eikan Wang (`EikanWang <https://github.com/EikanWang>`__)
|
||||
- Guangye Yu (`guangyey <https://github.com/guangyey>`__)
|
||||
|
||||
Build + CI
|
||||
~~~~~~~~~~
|
||||
|
||||
|
||||
@ -258,6 +258,28 @@ See the docs for {class}`~torch.cuda.gds.GdsFile` for an example of how to use t
|
||||
|
||||
```
|
||||
|
||||
## Green Contexts (experimental)
|
||||
|
||||
`torch.cuda.green_contexts` provides thin wrappers around the CUDA Green Context APIs
|
||||
to enable more general carveout of SM resources for CUDA kernels.
|
||||
|
||||
These APIs can be used in PyTorch with CUDA versions greater than or equal to 12.8.
|
||||
|
||||
See the docs for {class}`~torch.cuda.green_contexts.GreenContext` for an example of how to use these.
|
||||
|
||||
```{eval-rst}
|
||||
.. currentmodule:: torch.cuda.green_contexts
|
||||
```
|
||||
|
||||
```{eval-rst}
|
||||
.. autosummary::
|
||||
:toctree: generated
|
||||
:nosignatures:
|
||||
|
||||
GreenContext
|
||||
```
|
||||
|
||||
|
||||
% This module needs to be documented. Adding here in the meantime
|
||||
|
||||
% for tracking purposes
|
||||
@ -270,6 +292,10 @@ See the docs for {class}`~torch.cuda.gds.GdsFile` for an example of how to use t
|
||||
.. py:module:: torch.cuda.gds
|
||||
```
|
||||
|
||||
```{eval-rst}
|
||||
.. py:module:: torch.cuda.green_contexts
|
||||
```
|
||||
|
||||
```{eval-rst}
|
||||
.. py:module:: torch.cuda.jiterator
|
||||
```
|
||||
|
||||
@ -44,9 +44,9 @@ following invariants. More specifications about the IR can be found
|
||||
- **Normalized**: There are no Python semantics within the graph. Submodules
|
||||
from the original programs are inlined to form one fully flattened
|
||||
computational graph.
|
||||
- **Graph properties**: The graph is purely functional, meaning it does not
|
||||
contain operations with side effects such as mutations or aliasing. It does
|
||||
not mutate any intermediate values, parameters, or buffers.
|
||||
- **Graph properties**: By default, the graph may contain both functional and
|
||||
non-functional operators (including mutations). To obtain a purely functional
|
||||
graph, use `run_decompositions()`, which removes mutations and aliasing.
|
||||
- **Metadata**: The graph contains metadata captured during tracing, such as a
|
||||
stacktrace from user's code.
|
||||
|
||||
@ -56,8 +56,8 @@ Under the hood, `torch.export` leverages the following latest technologies:
|
||||
called the Frame Evaluation API to safely trace PyTorch graphs. This
|
||||
provides a massively improved graph capturing experience, with far fewer
|
||||
rewrites needed in order to fully trace the PyTorch code.
|
||||
- **AOT Autograd** provides a functionalized PyTorch graph and ensures the graph
|
||||
is decomposed/lowered to the ATen operator set.
|
||||
- **AOT Autograd** ensures the graph is decomposed/lowered to the ATen operator
|
||||
set. When using `run_decompositions()`, it can also provide functionalization.
|
||||
- **Torch FX (torch.fx)** is the underlying representation of the graph,
|
||||
allowing flexible Python-based transformations.
|
||||
|
||||
@ -444,23 +444,31 @@ saved_exported_program = torch.export.load('exported_program.pt2')
|
||||
|
||||
(training-export)=
|
||||
|
||||
## Export IR, Decompositions
|
||||
## Export IR: Training vs Inference
|
||||
|
||||
`torch.export` produces a graph containing only
|
||||
[ATen operators](https://pytorch.org/cppdocs/#aten), which are the basic unit of
|
||||
computation in PyTorch. As there are over
|
||||
3000 ATen operators, export provides a way to narrow down the operator set used
|
||||
in the graph based on certain characteristics, creating different IRs.
|
||||
computation in PyTorch. Export provides different IR levels based on your use case:
|
||||
|
||||
By default, export produces the most generic IR which contains all ATen
|
||||
operators, including both functional and non-functional operators. A functional
|
||||
operator is one that does not contain any mutations or aliasing of the inputs.
|
||||
| IR Type | How to Obtain | Properties | Operator Count | Use Case |
|
||||
|---------|---------------|------------|----------------|----------|
|
||||
| Training IR | `torch.export.export()` (default) | May contain mutations | ~3000 | Training with autograd |
|
||||
| Inference IR | `ep.run_decompositions(decomp_table={})` | Purely functional | ~2000 | Inference deployment |
|
||||
| Core ATen IR | `ep.run_decompositions(decomp_table=None)` | Purely functional, highly decomposed | ~180 | Minimal backend support |
|
||||
|
||||
### Training IR (Default)
|
||||
|
||||
By default, export produces a **Training IR** which contains all ATen
|
||||
operators, including both functional and non-functional (mutating) operators.
|
||||
A functional operator is one that does not contain any mutations or aliasing
|
||||
of the inputs, while non-functional operators may modify their inputs in-place.
|
||||
You can find a list of all ATen operators
|
||||
[here](https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/native/native_functions.yaml)
|
||||
and you can inspect if an operator is functional by checking
|
||||
`op._schema.is_mutable`.
|
||||
|
||||
This generic IR can be used to train in eager PyTorch Autograd.
|
||||
This Training IR, which may contain mutations, is designed for training use
|
||||
cases and can be used with eager PyTorch Autograd.
|
||||
|
||||
```{code-cell}
|
||||
import torch
|
||||
@ -480,15 +488,18 @@ ep_for_training = torch.export.export(M(), (torch.randn(1, 1, 3, 3),))
|
||||
print(ep_for_training.graph_module.print_readable(print_output=False))
|
||||
```
|
||||
|
||||
However, if you want to use the IR for inference, or decrease the amount of
|
||||
operators being used, you can lower the graph through the
|
||||
{func}`ExportedProgram.run_decompositions` API. This method decomposes the
|
||||
ATen operators into the ones specified in the decomposition table, and
|
||||
functionalizes the graph.
|
||||
### Inference IR (via run_decompositions)
|
||||
|
||||
By specifying an empty set, we're only performing functionalization, and does
|
||||
not do any additional decompositions. This results in an IR which contains ~2000
|
||||
operators (instead of the 3000 operators above), and is ideal for inference cases.
|
||||
To obtain an **Inference IR** suitable for deployment, use the
|
||||
{func}`ExportedProgram.run_decompositions` API. This method automatically:
|
||||
1. Functionalizes the graph (removes all mutations and converts them to functional equivalents)
|
||||
2. Optionally decomposes ATen operators based on the provided decomposition table
|
||||
|
||||
This produces a purely functional graph ideal for inference scenarios.
|
||||
|
||||
By specifying an empty decomposition table (`decomp_table={}`), you get just
|
||||
the functionalization without additional decompositions. This produces an
|
||||
Inference IR with ~2000 functional operators (compared to 3000+ in Training IR).
|
||||
|
||||
```{code-cell}
|
||||
import torch
|
||||
@ -514,11 +525,14 @@ As we can see, the previously in-place operator,
|
||||
`torch.ops.aten.add_.default` has now been replaced with
|
||||
`torch.ops.aten.add.default`, a functional operator.
|
||||
|
||||
We can also further lower this exported program to an operator set which only
|
||||
contains the
|
||||
### Core ATen IR
|
||||
|
||||
We can further lower the Inference IR to the
|
||||
[Core ATen Operator Set](https://pytorch.org/docs/main/torch.compiler_ir.html#core-aten-ir),
|
||||
which is a collection of only ~180 operators. This IR is optimal for backends
|
||||
who do not want to reimplement all ATen operators.
|
||||
which contains only ~180 operators. This is achieved by passing `decomp_table=None`
|
||||
(which uses the default decomposition table) to `run_decompositions()`. This IR
|
||||
is optimal for backends that want to minimize the number of operators they need
|
||||
to implement.
|
||||
|
||||
```{code-cell}
|
||||
import torch
|
||||
|
||||
@ -208,11 +208,13 @@ select = [
|
||||
"PLC1802", # len({expression}) used as condition without comparison
|
||||
"PLC0205", # string as __slots__
|
||||
"PLC3002", # unnecessary-direct-lambda-call
|
||||
"PLC0414", # Import alias does not rename original package
|
||||
"PLE",
|
||||
"PLR0133", # constant comparison
|
||||
"PLR0206", # property with params
|
||||
"PLR1722", # use sys exit
|
||||
"PLR1736", # unnecessary list index
|
||||
"PLW0127", # Self-assignment of variable
|
||||
"PLW0129", # assert on string literal
|
||||
"PLW0131", # named expr without context
|
||||
"PLW0133", # useless exception statement
|
||||
|
||||
@ -23,10 +23,12 @@ project-includes = [
|
||||
project-excludes = [
|
||||
# ==== below will be enabled directory by directory ====
|
||||
# ==== to test Pyrefly on a specific directory, simply comment it out ====
|
||||
"torch/_inductor/runtime",
|
||||
"torch/_inductor/codegen/triton.py",
|
||||
"tools/linter/adapters/test_device_bias_linter.py",
|
||||
"tools/code_analyzer/gen_operators_yaml.py",
|
||||
"torch/_inductor/runtime/triton_heuristics.py",
|
||||
"torch/_inductor/runtime/triton_helpers.py",
|
||||
"torch/_inductor/runtime/halide_helpers.py",
|
||||
# formatting issues, will turn on after adjusting where suppressions can be
|
||||
# in import statements
|
||||
"tools/flight_recorder/components/types.py",
|
||||
|
||||
@ -7,7 +7,6 @@ set(AOTI_ABI_CHECK_TEST_SRCS
|
||||
${AOTI_ABI_CHECK_TEST_ROOT}/test_devicetype.cpp
|
||||
${AOTI_ABI_CHECK_TEST_ROOT}/test_dtype.cpp
|
||||
${AOTI_ABI_CHECK_TEST_ROOT}/test_exception.cpp
|
||||
${AOTI_ABI_CHECK_TEST_ROOT}/test_headeronlyarrayref.cpp
|
||||
${AOTI_ABI_CHECK_TEST_ROOT}/test_macros.cpp
|
||||
${AOTI_ABI_CHECK_TEST_ROOT}/test_math.cpp
|
||||
${AOTI_ABI_CHECK_TEST_ROOT}/test_rand.cpp
|
||||
|
||||
@ -1,52 +0,0 @@
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include <torch/headeronly/util/HeaderOnlyArrayRef.h>
|
||||
|
||||
#include <vector>
|
||||
|
||||
using torch::headeronly::HeaderOnlyArrayRef;
|
||||
|
||||
TEST(TestHeaderOnlyArrayRef, TestEmpty) {
|
||||
HeaderOnlyArrayRef<float> arr;
|
||||
ASSERT_TRUE(arr.empty());
|
||||
}
|
||||
|
||||
TEST(TestHeaderOnlyArrayRef, TestSingleton) {
|
||||
float val = 5.0f;
|
||||
HeaderOnlyArrayRef<float> arr(val);
|
||||
ASSERT_FALSE(arr.empty());
|
||||
EXPECT_EQ(arr.size(), 1);
|
||||
EXPECT_EQ(arr[0], val);
|
||||
}
|
||||
|
||||
TEST(TestHeaderOnlyArrayRef, TestAPIs) {
|
||||
std::vector<int> vec = {1, 2, 3, 4, 5, 6, 7};
|
||||
HeaderOnlyArrayRef<int> arr(vec);
|
||||
ASSERT_FALSE(arr.empty());
|
||||
EXPECT_EQ(arr.size(), 7);
|
||||
for (size_t i = 0; i < arr.size(); i++) {
|
||||
EXPECT_EQ(arr[i], i + 1);
|
||||
EXPECT_EQ(arr.at(i), i + 1);
|
||||
}
|
||||
EXPECT_EQ(arr.front(), 1);
|
||||
EXPECT_EQ(arr.back(), 7);
|
||||
ASSERT_TRUE(arr.slice(3, 4).equals(arr.slice(3)));
|
||||
}
|
||||
|
||||
TEST(TestHeaderOnlyArrayRef, TestFromInitializerList) {
|
||||
std::vector<int> vec = {1, 2, 3, 4, 5, 6, 7};
|
||||
HeaderOnlyArrayRef<int> arr({1, 2, 3, 4, 5, 6, 7});
|
||||
auto res_vec = arr.vec();
|
||||
for (size_t i = 0; i < vec.size(); i++) {
|
||||
EXPECT_EQ(vec[i], res_vec[i]);
|
||||
}
|
||||
}
|
||||
|
||||
TEST(TestHeaderOnlyArrayRef, TestFromRange) {
|
||||
std::vector<int> vec = {1, 2, 3, 4, 5, 6, 7};
|
||||
HeaderOnlyArrayRef<int> arr(vec.data() + 3, vec.data() + 7);
|
||||
auto res_vec = arr.vec();
|
||||
for (size_t i = 0; i < res_vec.size(); i++) {
|
||||
EXPECT_EQ(vec[i + 3], res_vec[i]);
|
||||
}
|
||||
}
|
||||
@ -74,3 +74,19 @@ TEST(TestScalarType, operator_left_shift) {
|
||||
AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS(DEFINE_CHECK);
|
||||
#undef DEFINE_CHECK
|
||||
}
|
||||
|
||||
TEST(TestScalarType, toUnderlying) {
|
||||
using torch::headeronly::ScalarType;
|
||||
using torch::headeronly::toUnderlying;
|
||||
|
||||
EXPECT_EQ(toUnderlying(ScalarType::QUInt8), ScalarType::Byte);
|
||||
EXPECT_EQ(toUnderlying(ScalarType::QUInt4x2), ScalarType::Byte);
|
||||
EXPECT_EQ(toUnderlying(ScalarType::QUInt2x4), ScalarType::Byte);
|
||||
EXPECT_EQ(toUnderlying(ScalarType::QInt8), ScalarType::Char);
|
||||
EXPECT_EQ(toUnderlying(ScalarType::QInt32), ScalarType::Int);
|
||||
#define DEFINE_CHECK(_, name) \
|
||||
EXPECT_EQ(toUnderlying(ScalarType::name), ScalarType::name);
|
||||
AT_FORALL_SCALAR_TYPES_WITH_COMPLEX(DEFINE_CHECK);
|
||||
AT_FORALL_FLOAT8_TYPES(DEFINE_CHECK);
|
||||
#undef DEFINE_CHECK
|
||||
}
|
||||
|
||||
@ -311,9 +311,10 @@ void boxed_fill_infinity(
|
||||
}
|
||||
|
||||
Tensor my_pad(Tensor t) {
|
||||
std::vector<int64_t> padding = {1, 2, 2, 1};
|
||||
std::string mode = "constant";
|
||||
double value = 0.0;
|
||||
return pad(t, {1, 2, 2, 1}, mode, value);
|
||||
return pad(t, padding, mode, value);
|
||||
}
|
||||
|
||||
void boxed_my_pad(
|
||||
@ -341,9 +342,6 @@ void boxed_my_narrow(
|
||||
}
|
||||
|
||||
Tensor my_new_empty_dtype_variant(Tensor t) {
|
||||
// Still using a std::vector below even though people can just pass in an
|
||||
// initializer list (which will be implicitly converted to an HeaderOnlyArrayRef)
|
||||
// directly.
|
||||
std::vector<int64_t> sizes = {2, 5};
|
||||
auto dtype = std::make_optional(torch::headeronly::ScalarType::BFloat16);
|
||||
return new_empty(t, sizes, dtype);
|
||||
@ -355,8 +353,9 @@ void boxed_my_new_empty_dtype_variant(StableIValue* stack, uint64_t num_args, ui
|
||||
}
|
||||
|
||||
Tensor my_new_zeros_dtype_variant(Tensor t) {
|
||||
std::vector<int64_t> sizes = {2, 5};
|
||||
auto dtype = std::make_optional(at::ScalarType::Float);
|
||||
return new_zeros(t, {2, 5}, dtype);
|
||||
return new_zeros(t, sizes, dtype);
|
||||
}
|
||||
|
||||
void boxed_my_new_zeros_dtype_variant(StableIValue* stack, uint64_t num_args, uint64_t num_outputs) {
|
||||
@ -430,7 +429,8 @@ void boxed_my_amax(StableIValue* stack, uint64_t num_args, uint64_t num_outputs)
|
||||
}
|
||||
|
||||
Tensor my_amax_vec(Tensor t) {
|
||||
return amax(t, {0,1}, false);
|
||||
std::vector<int64_t> v = {0,1};
|
||||
return amax(t, v, false);
|
||||
}
|
||||
|
||||
void boxed_my_amax_vec(StableIValue* stack, uint64_t num_args, uint64_t num_outputs) {
|
||||
|
||||
@ -1166,7 +1166,7 @@ class TestFullyShardPrefetch(FSDPTest):
|
||||
loss = model(inp)
|
||||
events.clear()
|
||||
loss.sum().backward()
|
||||
expected_backward_events = expected_backward_events = [
|
||||
expected_backward_events = [
|
||||
("unshard", "norm, output", TrainingState.PRE_BACKWARD),
|
||||
# root explicit prefetch layers.2
|
||||
("unshard", "layers.2", TrainingState.PRE_BACKWARD),
|
||||
|
||||
@ -67,7 +67,21 @@ class TestFullyShardMemory(FSDPTest):
|
||||
# allocate the cuBLAS workspaces before measuring the memory usage
|
||||
# since the workspace size can differ between hardwares
|
||||
lin = torch.nn.Linear(768, 768, device=device_type)
|
||||
inp = torch.randn(1, 768, device=device_type)
|
||||
# NOTE: before https://github.com/pytorch/pytorch/pull/163955,
|
||||
# the input shape was (1, 768), so that the forward gemm used
|
||||
# cublaslt, and the backward used cublas.
|
||||
# With the aforementioned PR, and with shape (1, 768),
|
||||
# the cublas path is used both in forward and in backward,
|
||||
# altering peak memory usage not accounting for cublaslt.
|
||||
# Here we change the input shape to (2, 768), and that swaps
|
||||
# the cublas/cublaslt selection in the forward/backward,
|
||||
# but that does not affect the peak memory usage stored in `base_mem_mb`.
|
||||
# Reasons for the flip:
|
||||
# before PR: no Lt in addmm when mat2 has nrows/ncols <= 1,
|
||||
# after PR: no Lt in addmm when either mat1 or mat2 have nrows/ncols <= 1,
|
||||
# since the input preparation can swap matrices based on output
|
||||
# row-/col-majorness.
|
||||
inp = torch.randn(2, 768, device=device_type)
|
||||
lin(inp).sum().backward()
|
||||
torch.get_device_module(device_type).empty_cache()
|
||||
base_mem_mb = self._get_peak_active_memory_mb()
|
||||
|
||||
@ -127,8 +127,9 @@ def echo1(msg: str, exitcode: int = 0) -> str:
|
||||
print(f"exit {exitcode} from {rank}", file=sys.stderr)
|
||||
sys.exit(exitcode)
|
||||
else:
|
||||
print(f"{msg} stdout from {rank}")
|
||||
print(f"{msg} stderr from {rank}", file=sys.stderr)
|
||||
for m in msg.split(","):
|
||||
print(f"{m} stdout from {rank}")
|
||||
print(f"{m} stderr from {rank}", file=sys.stderr)
|
||||
return f"{msg}_{rank}"
|
||||
|
||||
|
||||
@ -247,6 +248,13 @@ class _StartProcessesTest(TestCase):
|
||||
for line in expected:
|
||||
self.assertIn(line, actual)
|
||||
|
||||
def assert_not_in_file(self, lines: list[str], filename: str) -> None:
|
||||
lines = [f"{line.rstrip()}\n" for line in lines]
|
||||
with open(filename) as fp:
|
||||
actual = fp.readlines()
|
||||
for line in lines:
|
||||
self.assertNotIn(line, actual)
|
||||
|
||||
def assert_pids_noexist(self, pids: dict[int, int]):
|
||||
for local_rank, pid in pids.items():
|
||||
with self.assertRaises(
|
||||
@ -360,8 +368,8 @@ if not (TEST_WITH_DEV_DBG_ASAN or IS_WINDOWS or IS_MACOS):
|
||||
|
||||
self.assertIsNone(pc.wait(timeout=0.1, period=0.01))
|
||||
self.assertIsNotNone(pc.wait(period=0.1))
|
||||
self.assertTrue(pc._stderr_tail.stopped())
|
||||
self.assertTrue(pc._stdout_tail.stopped())
|
||||
for tail_log in pc._tail_logs:
|
||||
self.assertTrue(tail_log.stopped())
|
||||
|
||||
def test_pcontext_wait_on_a_child_thread(self):
|
||||
asyncio.run(asyncio.to_thread(self.test_pcontext_wait))
|
||||
@ -379,8 +387,8 @@ if not (TEST_WITH_DEV_DBG_ASAN or IS_WINDOWS or IS_MACOS):
|
||||
pids = pc.pids()
|
||||
pc.close()
|
||||
self.assert_pids_noexist(pids)
|
||||
self.assertTrue(pc._stderr_tail.stopped())
|
||||
self.assertTrue(pc._stdout_tail.stopped())
|
||||
for tail_log in pc._tail_logs:
|
||||
self.assertTrue(tail_log.stopped())
|
||||
|
||||
def test_function_with_tensor(self):
|
||||
for start_method in self._start_methods:
|
||||
@ -482,8 +490,8 @@ if not (TEST_WITH_DEV_DBG_ASAN or IS_WINDOWS or IS_MACOS):
|
||||
int(error_file_data["message"]["extraInfo"]["timestamp"]),
|
||||
int(failure.timestamp),
|
||||
)
|
||||
self.assertTrue(pc._stderr_tail.stopped())
|
||||
self.assertTrue(pc._stdout_tail.stopped())
|
||||
for tail_log in pc._tail_logs:
|
||||
self.assertTrue(tail_log.stopped())
|
||||
|
||||
def test_wait_for_all_child_procs_to_exit(self):
|
||||
"""
|
||||
@ -580,8 +588,8 @@ if not (TEST_WITH_DEV_DBG_ASAN or IS_WINDOWS or IS_MACOS):
|
||||
self.assert_in_file([], results.stdouts[0])
|
||||
self.assertFalse(results.stderrs[1])
|
||||
self.assertFalse(results.stdouts[1])
|
||||
self.assertTrue(pc._stderr_tail.stopped())
|
||||
self.assertTrue(pc._stdout_tail.stopped())
|
||||
for tail_log in pc._tail_logs:
|
||||
self.assertTrue(tail_log.stopped())
|
||||
|
||||
failure = results.failures[1]
|
||||
self.assertEqual(-15, failure.exitcode)
|
||||
@ -731,8 +739,37 @@ if not (TEST_WITH_DEV_DBG_ASAN or IS_WINDOWS or IS_MACOS):
|
||||
self.assert_in_file(["hello stderr from 0"], pc.stderrs[0])
|
||||
self.assert_in_file(["world stderr from 1"], pc.stderrs[1])
|
||||
self.assertFalse(pc.stdouts[1])
|
||||
self.assertTrue(pc._stderr_tail.stopped())
|
||||
self.assertTrue(pc._stdout_tail.stopped())
|
||||
for tail_log in pc._tail_logs:
|
||||
self.assertTrue(tail_log.stopped())
|
||||
|
||||
def test_binary_duplicate_log_filters(self):
|
||||
pc = start_processes(
|
||||
name="trainer",
|
||||
entrypoint=bin("echo1.py"),
|
||||
args={0: ("helloA,helloB",), 1: ("worldA,worldB",)},
|
||||
envs={0: {"RANK": "0"}, 1: {"RANK": "1"}},
|
||||
logs_specs=DefaultLogsSpecs(
|
||||
log_dir=self.log_dir(),
|
||||
redirects={0: Std.ERR, 1: Std.NONE},
|
||||
tee={0: Std.OUT, 1: Std.ERR},
|
||||
),
|
||||
log_line_prefixes={0: "[rank0]:", 1: "[rank1]:"},
|
||||
duplicate_stdout_filters=["helloA"],
|
||||
duplicate_stderr_filters=["worldA", "B"],
|
||||
start_method="spawn",
|
||||
)
|
||||
|
||||
result = pc.wait()
|
||||
|
||||
self.assertFalse(result.is_failed())
|
||||
self.assert_in_file(["[rank0]:helloA stdout from 0"], pc.filtered_stdout)
|
||||
self.assert_not_in_file(
|
||||
["[rank0]:helloB stdout from 0"], pc.filtered_stdout
|
||||
)
|
||||
self.assert_in_file(["[rank1]:worldA stderr from 1"], pc.filtered_stderr)
|
||||
self.assert_in_file(["[rank1]:worldB stderr from 1"], pc.filtered_stderr)
|
||||
for tail_log in pc._tail_logs:
|
||||
self.assertTrue(tail_log.stopped())
|
||||
|
||||
|
||||
# tests incompatible with tsan or asan, the redirect functionality does not work on macos or windows
|
||||
@ -794,8 +831,44 @@ if not (TEST_WITH_DEV_DBG_ASAN or IS_WINDOWS or IS_MACOS or IS_CI):
|
||||
self.assert_in_file(["hello stderr from 0"], pc.stderrs[0])
|
||||
self.assert_in_file(["world stderr from 1"], pc.stderrs[1])
|
||||
self.assertFalse(pc.stdouts[1])
|
||||
self.assertTrue(pc._stderr_tail.stopped())
|
||||
self.assertTrue(pc._stdout_tail.stopped())
|
||||
for tail_log in pc._tail_logs:
|
||||
self.assertTrue(tail_log.stopped())
|
||||
|
||||
def test_function_duplicate_log_filters(self):
|
||||
for start_method in self._start_methods:
|
||||
with self.subTest(start_method=start_method):
|
||||
pc = start_processes(
|
||||
name="trainer",
|
||||
entrypoint=echo1,
|
||||
args={0: ("helloA,helloB",), 1: ("worldA,worldB",)},
|
||||
envs={0: {"RANK": "0"}, 1: {"RANK": "1"}},
|
||||
logs_specs=DefaultLogsSpecs(
|
||||
log_dir=self.log_dir(),
|
||||
redirects={0: Std.ERR, 1: Std.NONE},
|
||||
tee={0: Std.OUT, 1: Std.ERR},
|
||||
),
|
||||
duplicate_stdout_filters=["helloA"],
|
||||
duplicate_stderr_filters=["worldA", "B"],
|
||||
start_method="spawn",
|
||||
)
|
||||
|
||||
result = pc.wait()
|
||||
|
||||
self.assertFalse(result.is_failed())
|
||||
self.assert_in_file(
|
||||
["[trainer0]:helloA stdout from 0"], pc.filtered_stdout
|
||||
)
|
||||
self.assert_not_in_file(
|
||||
["[trainer0]:helloB stdout from 0"], pc.filtered_stdout
|
||||
)
|
||||
self.assert_in_file(
|
||||
["[trainer1]:worldA stderr from 1"], pc.filtered_stderr
|
||||
)
|
||||
self.assert_in_file(
|
||||
["[trainer1]:worldB stderr from 1"], pc.filtered_stderr
|
||||
)
|
||||
for tail_log in pc._tail_logs:
|
||||
self.assertTrue(tail_log.stopped())
|
||||
|
||||
def test_function(self):
|
||||
for start_method, redirs in product(self._start_methods, redirects_all()):
|
||||
@ -880,8 +953,8 @@ if not (TEST_WITH_DEV_DBG_ASAN or IS_WINDOWS or IS_MACOS or IS_CI):
|
||||
self.assertFalse(results.stdouts[0])
|
||||
self.assertFalse(results.stderrs[1])
|
||||
self.assertFalse(results.stdouts[1])
|
||||
self.assertTrue(pc._stderr_tail.stopped())
|
||||
self.assertTrue(pc._stdout_tail.stopped())
|
||||
for tail_log in pc._tail_logs:
|
||||
self.assertTrue(tail_log.stopped())
|
||||
|
||||
def test_no_zombie_process_function(self):
|
||||
signals = [signal.SIGTERM, signal.SIGINT, signal.SIGHUP, signal.SIGQUIT]
|
||||
|
||||
@ -23,5 +23,6 @@ if __name__ == "__main__":
|
||||
print(f"exit {exitcode} from {rank}", file=sys.stderr)
|
||||
sys.exit(exitcode)
|
||||
else:
|
||||
print(f"{args.msg} stdout from {rank}")
|
||||
print(f"{args.msg} stderr from {rank}", file=sys.stderr)
|
||||
for msg in args.msg.split(","):
|
||||
print(f"{msg} stdout from {rank}")
|
||||
print(f"{msg} stderr from {rank}", file=sys.stderr)
|
||||
|
||||
@ -84,6 +84,53 @@ class TailLogTest(unittest.TestCase):
|
||||
)
|
||||
self.assertTrue(tail.stopped())
|
||||
|
||||
def test_tail_write_to_dst_file(self):
|
||||
"""
|
||||
writer() writes 0 - max (on number on each line) to a log file.
|
||||
Run nprocs such writers and tail the log files into a temp file
|
||||
and validate that all lines are accounted for.
|
||||
"""
|
||||
nprocs = 32
|
||||
max = 1000
|
||||
interval_sec = 0.0001
|
||||
|
||||
log_files = {
|
||||
local_rank: os.path.join(self.test_dir, f"{local_rank}_stdout.log")
|
||||
for local_rank in range(nprocs)
|
||||
}
|
||||
|
||||
dst = os.path.join(self.test_dir, "tailed_stdout.log")
|
||||
tail = TailLog(
|
||||
name="writer", log_files=log_files, dst=dst, interval_sec=interval_sec
|
||||
).start()
|
||||
# sleep here is intentional to ensure that the log tail
|
||||
# can gracefully handle and wait for non-existent log files
|
||||
time.sleep(interval_sec * 10)
|
||||
|
||||
futs = []
|
||||
for local_rank, file in log_files.items():
|
||||
f = self.threadpool.submit(
|
||||
write, max=max, sleep=interval_sec * local_rank, file=file
|
||||
)
|
||||
futs.append(f)
|
||||
|
||||
wait(futs, return_when=ALL_COMPLETED)
|
||||
self.assertFalse(tail.stopped())
|
||||
tail.stop()
|
||||
|
||||
actual: dict[int, set[int]] = {}
|
||||
with open(dst) as dst_file:
|
||||
for line in dst_file:
|
||||
header, num = line.split(":")
|
||||
nums = actual.setdefault(header, set())
|
||||
nums.add(int(num))
|
||||
|
||||
self.assertEqual(nprocs, len(actual))
|
||||
self.assertEqual(
|
||||
{f"[writer{i}]": set(range(max)) for i in range(nprocs)}, actual
|
||||
)
|
||||
self.assertTrue(tail.stopped())
|
||||
|
||||
def test_tail_with_custom_prefix(self):
|
||||
"""
|
||||
writer() writes 0 - max (on number on each line) to a log file.
|
||||
@ -131,6 +178,52 @@ class TailLogTest(unittest.TestCase):
|
||||
self.assertIn(f"[worker{i}][{i}]", headers)
|
||||
self.assertTrue(tail.stopped())
|
||||
|
||||
def test_tail_with_custom_filter(self):
|
||||
"""
|
||||
writer() writes 0 - max (on number on each line) to a log file.
|
||||
Run nprocs such writers and tail the log files into an IOString
|
||||
and validate that all lines are accounted for.
|
||||
"""
|
||||
nprocs = 3
|
||||
max = 20
|
||||
interval_sec = 0.0001
|
||||
|
||||
log_files = {
|
||||
local_rank: os.path.join(self.test_dir, f"{local_rank}_stdout.log")
|
||||
for local_rank in range(nprocs)
|
||||
}
|
||||
|
||||
dst = io.StringIO()
|
||||
tail = TailLog(
|
||||
"writer",
|
||||
log_files,
|
||||
dst,
|
||||
interval_sec=interval_sec,
|
||||
log_line_filter=lambda line: "2" in line, # only print lines containing '2'
|
||||
).start()
|
||||
# sleep here is intentional to ensure that the log tail
|
||||
# can gracefully handle and wait for non-existent log files
|
||||
time.sleep(interval_sec * 10)
|
||||
futs = []
|
||||
for local_rank, file in log_files.items():
|
||||
f = self.threadpool.submit(
|
||||
write, max=max, sleep=interval_sec * local_rank, file=file
|
||||
)
|
||||
futs.append(f)
|
||||
wait(futs, return_when=ALL_COMPLETED)
|
||||
self.assertFalse(tail.stopped())
|
||||
tail.stop()
|
||||
dst.seek(0)
|
||||
|
||||
actual: dict[int, set[int]] = {}
|
||||
for line in dst.readlines():
|
||||
header, num = line.split(":")
|
||||
nums = actual.setdefault(header, set())
|
||||
nums.add(int(num))
|
||||
self.assertEqual(nprocs, len(actual))
|
||||
self.assertEqual({f"[writer{i}]": {2, 12} for i in range(nprocs)}, actual)
|
||||
self.assertTrue(tail.stopped())
|
||||
|
||||
def test_tail_no_files(self):
|
||||
"""
|
||||
Ensures that the log tail can gracefully handle no log files
|
||||
|
||||
@ -55,9 +55,10 @@ class SignalHandlingTest(TestCase):
|
||||
mock_threading.main_thread.return_value
|
||||
)
|
||||
mock_pcontext = MagicMock(spec=PContext)
|
||||
# Mock the _stdout_tail and _stderr_tail attributes
|
||||
mock_pcontext._stdout_tail = MagicMock()
|
||||
mock_pcontext._stderr_tail = MagicMock()
|
||||
# Mock the stdout_tail and stderr_tail
|
||||
mock_stdout_tail = MagicMock()
|
||||
mock_stderr_tail = MagicMock()
|
||||
mock_pcontext._tail_logs = [mock_stdout_tail, mock_stderr_tail]
|
||||
|
||||
# Remove environment variable if it exists to test default behavior
|
||||
if "TORCHELASTIC_SIGNALS_TO_HANDLE" in os.environ:
|
||||
@ -84,8 +85,8 @@ class SignalHandlingTest(TestCase):
|
||||
# Verify _start was called
|
||||
mock_pcontext._start.assert_called_once()
|
||||
# Verify _stdout_tail.start() and _stderr_tail.start() were called
|
||||
mock_pcontext._stdout_tail.start.assert_called_once()
|
||||
mock_pcontext._stderr_tail.start.assert_called_once()
|
||||
mock_stdout_tail.start.assert_called_once()
|
||||
mock_stderr_tail.start.assert_called_once()
|
||||
|
||||
@patch("torch.distributed.elastic.multiprocessing.api.threading")
|
||||
@patch("torch.distributed.elastic.multiprocessing.api.signal")
|
||||
@ -99,9 +100,10 @@ class SignalHandlingTest(TestCase):
|
||||
mock_threading.main_thread.return_value
|
||||
)
|
||||
mock_pcontext = MagicMock(spec=PContext)
|
||||
# Mock the _stdout_tail and _stderr_tail attributes
|
||||
mock_pcontext._stdout_tail = MagicMock()
|
||||
mock_pcontext._stderr_tail = MagicMock()
|
||||
# Mock the stdout_tail and stderr_tail
|
||||
mock_stdout_tail = MagicMock()
|
||||
mock_stderr_tail = MagicMock()
|
||||
mock_pcontext._tail_logs = [mock_stdout_tail, mock_stderr_tail]
|
||||
|
||||
# Set custom signals in the environment variable
|
||||
os.environ["TORCHELASTIC_SIGNALS_TO_HANDLE"] = "SIGTERM,SIGUSR1,SIGUSR2"
|
||||
@ -139,9 +141,10 @@ class SignalHandlingTest(TestCase):
|
||||
mock_threading.main_thread.return_value
|
||||
)
|
||||
mock_pcontext = MagicMock(spec=PContext)
|
||||
# Mock the _stdout_tail and _stderr_tail attributes
|
||||
mock_pcontext._stdout_tail = MagicMock()
|
||||
mock_pcontext._stderr_tail = MagicMock()
|
||||
# Mock the stdout_tail and stderr_tail
|
||||
mock_stdout_tail = MagicMock()
|
||||
mock_stderr_tail = MagicMock()
|
||||
mock_pcontext._tail_logs = [mock_stdout_tail, mock_stderr_tail]
|
||||
|
||||
# Set invalid signals in the environment variable
|
||||
os.environ["TORCHELASTIC_SIGNALS_TO_HANDLE"] = "SIGTERM,INVALID_SIGNAL"
|
||||
@ -180,9 +183,10 @@ class SignalHandlingTest(TestCase):
|
||||
mock_threading.main_thread.return_value
|
||||
)
|
||||
mock_pcontext = MagicMock(spec=PContext)
|
||||
# Mock the _stdout_tail and _stderr_tail attributes
|
||||
mock_pcontext._stdout_tail = MagicMock()
|
||||
mock_pcontext._stderr_tail = MagicMock()
|
||||
# Mock the stdout_tail and stderr_tail
|
||||
mock_stdout_tail = MagicMock()
|
||||
mock_stderr_tail = MagicMock()
|
||||
mock_pcontext._tail_logs = [mock_stdout_tail, mock_stderr_tail]
|
||||
|
||||
# Set signals including ones not supported on Windows
|
||||
os.environ["TORCHELASTIC_SIGNALS_TO_HANDLE"] = "SIGTERM,SIGHUP,SIGUSR1"
|
||||
@ -234,9 +238,10 @@ class SignalHandlingTest(TestCase):
|
||||
mock_threading.current_thread.return_value = MagicMock() # Not the main thread
|
||||
mock_threading.main_thread.return_value = MagicMock()
|
||||
mock_pcontext = MagicMock(spec=PContext)
|
||||
# Mock the _stdout_tail and _stderr_tail attributes
|
||||
mock_pcontext._stdout_tail = MagicMock()
|
||||
mock_pcontext._stderr_tail = MagicMock()
|
||||
# Mock the stdout_tail and stderr_tail
|
||||
mock_stdout_tail = MagicMock()
|
||||
mock_stderr_tail = MagicMock()
|
||||
mock_pcontext._tail_logs = [mock_stdout_tail, mock_stderr_tail]
|
||||
|
||||
# Call the start method
|
||||
PContext.start(mock_pcontext)
|
||||
@ -262,9 +267,10 @@ class SignalHandlingTest(TestCase):
|
||||
mock_threading.main_thread.return_value
|
||||
)
|
||||
mock_pcontext = MagicMock(spec=PContext)
|
||||
# Mock the _stdout_tail and _stderr_tail attributes
|
||||
mock_pcontext._stdout_tail = MagicMock()
|
||||
mock_pcontext._stderr_tail = MagicMock()
|
||||
# Mock the stdout_tail and stderr_tail
|
||||
mock_stdout_tail = MagicMock()
|
||||
mock_stderr_tail = MagicMock()
|
||||
mock_pcontext._tail_logs = [mock_stdout_tail, mock_stderr_tail]
|
||||
|
||||
# Set environment variable to include SIGUSR1 and SIGUSR2
|
||||
os.environ["TORCHELASTIC_SIGNALS_TO_HANDLE"] = "SIGUSR1,SIGUSR2"
|
||||
@ -323,8 +329,8 @@ class SignalHandlingTest(TestCase):
|
||||
# Verify _start was called
|
||||
mock_pcontext._start.assert_called_once()
|
||||
# Verify _stdout_tail.start() and _stderr_tail.start() were called
|
||||
mock_pcontext._stdout_tail.start.assert_called_once()
|
||||
mock_pcontext._stderr_tail.start.assert_called_once()
|
||||
mock_stdout_tail.start.assert_called_once()
|
||||
mock_stderr_tail.start.assert_called_once()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@ -337,6 +337,70 @@ class ScheduleTest(MultiProcContinuousTest):
|
||||
if self.rank == self.world_size - 1:
|
||||
self.assertTrue(len(losses) > 0, "Losses should be computed during eval()")
|
||||
|
||||
@requires_accelerator_dist_backend(["nccl", "xccl"])
|
||||
@skip_but_pass_in_sandcastle_if(
|
||||
not TEST_MULTIACCELERATOR, f"{backend} test requires 2+ GPUs"
|
||||
)
|
||||
@parametrize(
|
||||
"ScheduleClass",
|
||||
[
|
||||
ScheduleGPipe,
|
||||
Schedule1F1B,
|
||||
ScheduleInterleaved1F1B,
|
||||
ScheduleLoopedBFS,
|
||||
ScheduleInterleavedZeroBubble,
|
||||
],
|
||||
)
|
||||
def test_return_output(self, ScheduleClass):
|
||||
num_microbatches = 4
|
||||
if ScheduleClass in [
|
||||
ScheduleInterleaved1F1B,
|
||||
ScheduleLoopedBFS,
|
||||
ScheduleInterleavedZeroBubble,
|
||||
]:
|
||||
# Multi-stage schedules
|
||||
stages_per_rank = 2
|
||||
n_stages = stages_per_rank * self.world_size
|
||||
mod, _, x, target, loss_fn = setup_models_and_data(
|
||||
self.config, n_layers=n_stages
|
||||
)
|
||||
|
||||
# Create multi-stage pipeline
|
||||
stages, stage_modules, _ = create_multi_stage_pipeline(
|
||||
self.config, mod, stages_per_rank, n_stages
|
||||
)
|
||||
schedule = ScheduleClass(
|
||||
stages,
|
||||
num_microbatches,
|
||||
loss_fn=loss_fn,
|
||||
scale_grads=False,
|
||||
)
|
||||
else:
|
||||
# Single-stage schedules
|
||||
mod, _, x, target, loss_fn = setup_models_and_data(self.config)
|
||||
|
||||
# Create single-stage pipeline
|
||||
stage, stage_module, _ = create_single_stage_pipeline(
|
||||
self.config, mod, x, num_microbatches
|
||||
)
|
||||
schedule = ScheduleClass(
|
||||
stage,
|
||||
num_microbatches,
|
||||
loss_fn=loss_fn,
|
||||
scale_grads=False,
|
||||
)
|
||||
|
||||
losses = []
|
||||
|
||||
if self.rank == self.world_size - 1:
|
||||
output = schedule.step(target=target, losses=losses, return_outputs=False)
|
||||
else:
|
||||
schedule.step(x)
|
||||
|
||||
# Verify that output is None
|
||||
if self.rank == self.world_size - 1:
|
||||
self.assertTrue(output is None, "Output should be None")
|
||||
|
||||
@requires_accelerator_dist_backend(["nccl", "xccl"])
|
||||
@skip_but_pass_in_sandcastle_if(
|
||||
not TEST_MULTIACCELERATOR, f"{backend} test requires 2+ GPUs"
|
||||
|
||||
@ -15,7 +15,7 @@ from torch.testing._internal.common_utils import (
|
||||
TestCase,
|
||||
)
|
||||
from torch.testing._internal.distributed.fake_pg import FakeStore
|
||||
from torch.utils._debug_mode import DebugMode
|
||||
from torch.utils._debug_mode import _OpCall, _RedistributeCall, DebugMode
|
||||
from torch.utils._python_dispatch import TorchDispatchMode
|
||||
|
||||
|
||||
@ -60,6 +60,10 @@ class TestDTensorDebugMode(TestCase):
|
||||
aten::sum(t: f32[1, 32])""",
|
||||
)
|
||||
|
||||
self.assertTrue(isinstance(debug_mode.operators[0], _OpCall))
|
||||
self.assertTrue(isinstance(debug_mode.operators[2], _RedistributeCall))
|
||||
self.assertEqual(next(iter(debug_mode.operators[1])), torch.ops.aten.mm.default)
|
||||
|
||||
def test_debug_string_inside_context(self):
|
||||
mesh = DeviceMesh(self.device_type, list(range(self.world_size)))
|
||||
|
||||
|
||||
@ -6,7 +6,10 @@ import unittest
|
||||
import torch
|
||||
import torch.distributed as dist
|
||||
import torch.fx.traceback as fx_traceback
|
||||
from torch._dynamo.functional_export import _dynamo_graph_capture_for_export
|
||||
from torch._dynamo.functional_export import (
|
||||
_dynamo_graph_capture_for_export,
|
||||
dynamo_graph_capture_for_export,
|
||||
)
|
||||
from torch._functorch.aot_autograd import aot_export_joint_with_descriptors
|
||||
from torch._functorch.partitioners import min_cut_rematerialization_partition
|
||||
from torch._guards import tracing, TracingContext
|
||||
@ -96,6 +99,13 @@ def strict_export_and_aot_export_joint_with_descriptors(model, inputs):
|
||||
return aot_export_joint_with_descriptors_alone(ep.module(), inputs)
|
||||
|
||||
|
||||
def graph_capture_and_aot_export_joint_with_descriptors_v2(model, inputs):
|
||||
gm = dynamo_graph_capture_for_export(model)(inputs)
|
||||
fake_mode = gm.meta.get("fake_mode", None)
|
||||
with tracing(TracingContext(fake_mode)):
|
||||
return aot_export_joint_with_descriptors_alone(gm, inputs)
|
||||
|
||||
|
||||
def graph_capture_and_aot_export_joint_with_descriptors(model, inputs):
|
||||
with torch._dynamo.config.patch(install_free_tensors=True):
|
||||
# TODO: switch to use the official graph_capture API once it is ready
|
||||
@ -288,6 +298,7 @@ class DTensorExportTest(TestCase):
|
||||
@parametrize(
|
||||
"export_fn",
|
||||
[
|
||||
graph_capture_and_aot_export_joint_with_descriptors_v2,
|
||||
graph_capture_and_aot_export_joint_with_descriptors,
|
||||
aot_export_joint_with_descriptors_alone,
|
||||
],
|
||||
@ -307,7 +318,21 @@ class DTensorExportTest(TestCase):
|
||||
def test_annotate_aot_export_joint_with_descriptors_alone(self):
|
||||
self._run_test(aot_export_joint_with_descriptors_alone, True)
|
||||
|
||||
def test_dynamic_shapes(self):
|
||||
@parametrize(
|
||||
"export_fn_with_answer",
|
||||
[
|
||||
(
|
||||
graph_capture_and_aot_export_joint_with_descriptors_v2,
|
||||
"[[4, 10], [4], [10, 4], [10], [4, 10], [4], [10, 4], [10], [s64, 10], [s64, 10]]",
|
||||
),
|
||||
(
|
||||
graph_capture_and_aot_export_joint_with_descriptors,
|
||||
"[[4, 10], [4], [10, 4], [10], [s22, 10], [s22, 10]]",
|
||||
),
|
||||
],
|
||||
)
|
||||
def test_dynamic_shapes(self, export_fn_with_answer):
|
||||
export_fn, answer = export_fn_with_answer
|
||||
dp_degree = 2
|
||||
tp_degree = self.world_size // dp_degree
|
||||
|
||||
@ -331,7 +356,7 @@ class DTensorExportTest(TestCase):
|
||||
inputs = distribute_tensor(inputs, mesh_2d["tp"], placements=[Replicate()])
|
||||
torch._dynamo.mark_dynamic(inputs, 0, min=5, max=100)
|
||||
|
||||
joint_gm = graph_capture_and_aot_export_joint_with_descriptors(tp_model, inputs)
|
||||
joint_gm = export_fn(tp_model, inputs)
|
||||
|
||||
res = []
|
||||
for node in joint_gm.graph.nodes:
|
||||
@ -341,12 +366,16 @@ class DTensorExportTest(TestCase):
|
||||
if isinstance(fake_val, torch._subclasses.fake_tensor.FakeTensor):
|
||||
res.append(list(fake_val.shape))
|
||||
|
||||
self.assertExpectedInline(
|
||||
str(res),
|
||||
"""[[4, 10], [4], [10, 4], [10], [s22, 10], [s22, 10]]""",
|
||||
)
|
||||
self.assertEqual(str(res), answer)
|
||||
|
||||
def test_einsum_dtensor_export(self):
|
||||
@parametrize(
|
||||
"export_fn",
|
||||
[
|
||||
dynamo_graph_capture_for_export,
|
||||
_dynamo_graph_capture_for_export,
|
||||
],
|
||||
)
|
||||
def test_einsum_dtensor_export(self, export_fn):
|
||||
"""Test exporting a model with einsum that has DTensor inputs/outputs with side effects"""
|
||||
world_size = 4
|
||||
# Create device mesh
|
||||
@ -366,9 +395,7 @@ class DTensorExportTest(TestCase):
|
||||
output = model(x_dtensor, y_dtensor, z_dtensor)
|
||||
with torch._dynamo.config.patch(install_free_tensors=True):
|
||||
# TODO: switch to use the official graph_capture API once it is ready
|
||||
gm = _dynamo_graph_capture_for_export(model)(
|
||||
x_dtensor, y_dtensor, z_dtensor
|
||||
)
|
||||
gm = export_fn(model)(x_dtensor, y_dtensor, z_dtensor)
|
||||
output_gm = gm(x_dtensor, y_dtensor, z_dtensor)
|
||||
self.assertEqual(output, output_gm)
|
||||
|
||||
|
||||
@ -55,7 +55,7 @@ if TEST_WITH_DEV_DBG_ASAN:
|
||||
|
||||
# load_tests from common_utils is used to automatically filter tests for
|
||||
# sharding on sandcastle. This line silences flake warnings
|
||||
load_tests = load_tests
|
||||
load_tests = load_tests # noqa: PLW0127
|
||||
|
||||
if platform == "darwin":
|
||||
LOOPBACK = "lo0"
|
||||
|
||||
@ -1459,7 +1459,7 @@ class ProcessGroupGlooTest(MultiProcessTestCase):
|
||||
@requires_gloo()
|
||||
def test_reduce_checks(self):
|
||||
store = c10d.FileStore(self.file_name, self.world_size)
|
||||
pg = pg = self._create_process_group_gloo(
|
||||
pg = self._create_process_group_gloo(
|
||||
store, self.rank, self.world_size, self.opts()
|
||||
)
|
||||
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user