Compare commits

..

7 Commits

Author SHA1 Message Date
ec8c59a6ad Update on "[dynamo][pytree][compile time] Specialize tree_is_leaf"
cc voznesenskym penguinwu EikanWang jgong5 Guobing-Chen XiaobingSuper zhuhaozhe blzheng wenzhe-nrv jiayisunx chenyang78 kadeng chauhang amjames Lucaskabela

[ghstack-poisoned]
2025-11-18 11:59:01 -08:00
b6dcd5186e Update base for Update on "[dynamo][pytree][compile time] Specialize tree_is_leaf"
cc voznesenskym penguinwu EikanWang jgong5 Guobing-Chen XiaobingSuper zhuhaozhe blzheng wenzhe-nrv jiayisunx chenyang78 kadeng chauhang amjames Lucaskabela

[ghstack-poisoned]
2025-11-18 11:59:01 -08:00
6406a010a8 [dynamo][pytree][compile time] Specialize tree_is_leaf
[ghstack-poisoned]
2025-11-17 23:05:14 -08:00
cc5a1c93de Update on "[dynamo][compile time] Special case for torch.utils._pytree._get_node_type"
<Replace this line with a title. Use 1 line only, 67 chars or less>

Summary:

Test Plan:

Reviewers:

Subscribers:

Tasks:

Tags:

cc voznesenskym penguinwu EikanWang jgong5 Guobing-Chen XiaobingSuper zhuhaozhe blzheng wenzhe-nrv jiayisunx chenyang78 kadeng chauhang amjames Lucaskabela

[ghstack-poisoned]
2025-11-17 22:16:25 -08:00
dd3575139b Update on "[dynamo][compile time] Special case for torch.utils._pytree._get_node_type"
<Replace this line with a title. Use 1 line only, 67 chars or less>

Summary:

Test Plan:

Reviewers:

Subscribers:

Tasks:

Tags:

cc voznesenskym penguinwu EikanWang jgong5 Guobing-Chen XiaobingSuper zhuhaozhe blzheng wenzhe-nrv jiayisunx chenyang78 kadeng chauhang amjames Lucaskabela

[ghstack-poisoned]
2025-11-17 22:10:35 -08:00
1a55a4777c [dynamo][compile time] Special case for torch.utils._pytree._get_node_type
<Replace this line with a title. Use 1 line only, 67 chars or less>

Summary:

Test Plan:

Reviewers:

Subscribers:

Tasks:

Tags:

[ghstack-poisoned]
2025-11-17 16:21:20 -08:00
456833f78a [pytree][compile] Slightly faster TreeSpec init
Helps reduce Dynamo tracing time. Previously, the generator object caused more polyfills.
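
For illustration only (not the actual patch): a minimal sketch of the generator-vs-loop distinction in a TreeSpec-style constructor. The class and attribute names below are hypothetical.

# Hypothetical sketch; not the real torch.utils._pytree.TreeSpec implementation.
class ToySpec:
    def __init__(self, children):
        self.children = children
        # sum(c.num_nodes for c in children) would build a generator object,
        # which Dynamo has to model through a polyfill while tracing.
        # The explicit loop keeps the traced bytecode simple and cheaper.
        num_nodes = 1
        for c in children:
            num_nodes += c.num_nodes
        self.num_nodes = num_nodes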

[ghstack-poisoned]
2025-11-17 11:39:41 -08:00
39 changed files with 577 additions and 416 deletions

View File

@ -125,10 +125,10 @@ case "$tag" in
UCC_COMMIT=${_UCC_COMMIT}
TRITON=yes
;;
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11-inductor-benchmarks)
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks)
CUDA_VERSION=12.8.1
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=11
GCC_VERSION=9
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
@ -146,6 +146,16 @@ case "$tag" in
UCC_COMMIT=${_UCC_COMMIT}
TRITON=yes
;;
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9)
CUDA_VERSION=12.8.1
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=9
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
TRITON=yes
;;
pytorch-linux-jammy-py3-clang12-onnx)
ANACONDA_PYTHON_VERSION=3.10
CLANG_VERSION=12

View File

@ -23,7 +23,7 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
with:
runner: linux.12xlarge.memory
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm80
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
cuda-arch-list: '8.0 9.0'
test-matrix: |
@ -39,7 +39,7 @@ jobs:
needs: attn-microbenchmark-build
with:
timeout-minutes: 500
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm80
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
docker-image: ${{ needs.attn-microbenchmark-build.outputs.docker-image }}
test-matrix: ${{ needs.attn-microbenchmark-build.outputs.test-matrix }}
secrets: inherit
@ -51,7 +51,7 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
with:
runner: linux.12xlarge.memory
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
cuda-arch-list: '10.0'
test-matrix: |
@ -66,7 +66,7 @@ jobs:
needs: opmicrobenchmark-build-b200
with:
timeout-minutes: 500
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100
docker-image: ${{ needs.opmicrobenchmark-build-b200.outputs.docker-image }}
test-matrix: ${{ needs.opmicrobenchmark-build-b200.outputs.test-matrix }}
aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only

View File

@ -52,7 +52,8 @@ jobs:
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11,
pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11,
pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc11-vllm,
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11-inductor-benchmarks,
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks,
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9,
pytorch-linux-jammy-cuda12.4-cudnn9-py3-gcc11,
pytorch-linux-jammy-py3.10-clang12,
pytorch-linux-jammy-py3.11-clang12,

View File

@ -50,10 +50,9 @@ jobs:
matrix:
runner: [linux.rocm.gfx942.docker-cache]
docker-image: [
"${{ needs.download-docker-builds-artifacts.outputs.pytorch-linux-jammy-rocm-n-py3 }}"
#"${{ needs.download-docker-builds-artifacts.outputs.pytorch-linux-jammy-rocm-n-py3 }}",
#"${{ needs.download-docker-builds-artifacts.outputs.pytorch-linux-noble-rocm-n-py3 }}",
#"${{ needs.download-docker-builds-artifacts.outputs.pytorch-linux-jammy-rocm-n-py3-benchmarks }}"
"${{ needs.download-docker-builds-artifacts.outputs.pytorch-linux-jammy-rocm-n-py3 }}",
"${{ needs.download-docker-builds-artifacts.outputs.pytorch-linux-noble-rocm-n-py3 }}",
"${{ needs.download-docker-builds-artifacts.outputs.pytorch-linux-jammy-rocm-n-py3-benchmarks }}"
]
runs-on: "${{ matrix.runner }}"
steps:

View File

@ -30,14 +30,14 @@ jobs:
opt_out_experiments: lf
build:
name: cuda12.8-py3.10-gcc11-sm80
name: cuda12.8-py3.10-gcc9-sm80
uses: ./.github/workflows/_linux-build.yml
needs:
- get-default-label-prefix
with:
runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm80
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11-inductor-benchmarks
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.0'
test-matrix: |
{ include: [
@ -46,11 +46,11 @@ jobs:
secrets: inherit
test:
name: cuda12.8-py3.10-gcc11-sm80
name: cuda12.8-py3.10-gcc9-sm80
uses: ./.github/workflows/_linux-test.yml
needs: build
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm80
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
docker-image: ${{ needs.build.outputs.docker-image }}
test-matrix: ${{ needs.build.outputs.test-matrix }}
timeout-minutes: 720

View File

@ -27,14 +27,14 @@ jobs:
opt_out_experiments: lf
build:
name: cuda12.8-py3.10-gcc11-sm80
name: cuda12.8-py3.10-gcc9-sm80
uses: ./.github/workflows/_linux-build.yml
needs:
- get-default-label-prefix
with:
runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm80
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11-inductor-benchmarks
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.0'
test-matrix: |
{ include: [
@ -47,11 +47,11 @@ jobs:
secrets: inherit
test:
name: cuda12.8-py3.10-gcc11-sm80
name: cuda12.8-py3.10-gcc9-sm80
uses: ./.github/workflows/_linux-test.yml
needs: build
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm80
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
docker-image: ${{ needs.build.outputs.docker-image }}
test-matrix: ${{ needs.build.outputs.test-matrix }}
# disable monitor in perf tests for more investigation

View File

@ -80,7 +80,7 @@ jobs:
opt_out_experiments: lf
build:
name: cuda12.8-py3.10-gcc11-sm100
name: cuda12.8-py3.10-gcc9-sm100
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
@ -90,8 +90,8 @@ jobs:
# from trunk. Also use a memory-intensive runner here because memory is
# usually the bottleneck
runner: linux.12xlarge.memory
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11-inductor-benchmarks
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '10.0'
test-matrix: |
{ include: [
@ -104,12 +104,12 @@ jobs:
secrets: inherit
test-periodically:
name: cuda12.8-py3.10-gcc11-sm100
name: cuda12.8-py3.10-gcc9-sm100
uses: ./.github/workflows/_linux-test.yml
needs: build
if: github.event.schedule == '0 7 * * 1-6'
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100
dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-cudagraphs_low_precision-true
docker-image: ${{ needs.build.outputs.docker-image }}
test-matrix: ${{ needs.build.outputs.test-matrix }}
@ -121,12 +121,12 @@ jobs:
secrets: inherit
test-weekly:
name: cuda12.8-py3.10-gcc11-sm100
name: cuda12.8-py3.10-gcc9-sm100
uses: ./.github/workflows/_linux-test.yml
needs: build
if: github.event.schedule == '0 7 * * 0'
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100
dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-maxautotune-true-freeze_autotune_cudagraphs-true-cudagraphs_low_precision-true
docker-image: ${{ needs.build.outputs.docker-image }}
test-matrix: ${{ needs.build.outputs.test-matrix }}
@ -138,11 +138,11 @@ jobs:
secrets: inherit
test:
name: cuda12.8-py3.10-gcc11-sm100
name: cuda12.8-py3.10-gcc9-sm100
uses: ./.github/workflows/_linux-test.yml
needs: build
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100
dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cudagraphs-${{ inputs.cudagraphs }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}-maxautotune-${{ inputs.maxautotune }}-freezing_cudagraphs-${{ inputs.freezing_cudagraphs }}-cudagraphs_low_precision-${{ inputs.cudagraphs }}
docker-image: ${{ needs.build.outputs.docker-image }}
test-matrix: ${{ needs.build.outputs.test-matrix }}

View File

@ -95,8 +95,8 @@ jobs:
# from trunk. Also use a memory-intensive runner here because memory is
# usually the bottleneck
runner: linux.12xlarge.memory
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm90
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11-inductor-benchmarks
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm90
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '9.0'
test-matrix: |
{ include: [
@ -132,7 +132,7 @@ jobs:
needs: build
if: github.event.schedule == '15 0 * * 1-6'
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm90
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm90
dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-cudagraphs_low_precision-true
docker-image: ${{ needs.build.outputs.docker-image }}
test-matrix: ${{ needs.build.outputs.test-matrix }}
@ -149,7 +149,7 @@ jobs:
needs: build
if: github.event.schedule == '0 7 * * 0'
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm90
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm90
dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-maxautotune-true-freeze_autotune_cudagraphs-true-cudagraphs_low_precision-true
docker-image: ${{ needs.build.outputs.docker-image }}
test-matrix: ${{ needs.build.outputs.test-matrix }}
@ -168,7 +168,7 @@ jobs:
# needs one round of benchmark
if: ${{ github.event_name == 'workflow_dispatch' || github.event_name == 'pull_request' }}
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm90
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm90
dashboard-tag: training-${{ inputs.training || 'true' }}-inference-${{ inputs.inference || 'true' }}-default-${{ inputs.default || 'true' }}-dynamic-${{ inputs.dynamic || 'true' }}-cudagraphs-${{ inputs.cudagraphs || 'true' }}-cppwrapper-${{ inputs.cppwrapper || 'false' }}-aotinductor-${{ inputs.aotinductor || 'false' }}-maxautotune-${{ inputs.maxautotune || 'false' }}-freezing_cudagraphs-${{ inputs.freezing_cudagraphs || 'false' }}-cudagraphs_low_precision-${{ inputs.cudagraphs || 'false' }}
docker-image: ${{ needs.build.outputs.docker-image }}
test-matrix: ${{ needs.build.outputs.test-matrix }}

View File

@ -80,15 +80,15 @@ jobs:
opt_out_experiments: lf
build:
name: cuda12.8-py3.10-gcc11-sm80
name: cuda12.8-py3.10-gcc9-sm80
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
# Every bit to make perf run faster helps
runner: linux.12xlarge.memory
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm80
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11-inductor-benchmarks
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.0'
test-matrix: |
{ include: [
@ -117,12 +117,12 @@ jobs:
secrets: inherit
test-nightly:
name: cuda12.8-py3.10-gcc11-sm80
name: cuda12.8-py3.10-gcc9-sm80
uses: ./.github/workflows/_linux-test.yml
needs: build
if: github.event.schedule == '0 7 * * 1-6'
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm80
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-cudagraphs_low_precision-true
docker-image: ${{ needs.build.outputs.docker-image }}
test-matrix: ${{ needs.build.outputs.test-matrix }}
@ -133,12 +133,12 @@ jobs:
secrets: inherit
test-weekly:
name: cuda12.8-py3.10-gcc11-sm80
name: cuda12.8-py3.10-gcc9-sm80
uses: ./.github/workflows/_linux-test.yml
needs: build
if: github.event.schedule == '0 7 * * 0'
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm80
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-maxautotune-true-freeze_autotune_cudagraphs-true-cudagraphs_low_precision-true
docker-image: ${{ needs.build.outputs.docker-image }}
test-matrix: ${{ needs.build.outputs.test-matrix }}
@ -150,12 +150,12 @@ jobs:
secrets: inherit
test:
name: cuda12.8-py3.10-gcc11-sm80
name: cuda12.8-py3.10-gcc9-sm80
uses: ./.github/workflows/_linux-test.yml
needs: build
if: github.event_name == 'workflow_dispatch'
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm80
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cudagraphs-${{ inputs.cudagraphs }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}-maxautotune-${{ inputs.maxautotune }}-freezing_cudagraphs-${{ inputs.freezing_cudagraphs }}-cudagraphs_low_precision-${{ inputs.cudagraphs }}
docker-image: ${{ needs.build.outputs.docker-image }}
test-matrix: ${{ needs.build.outputs.test-matrix }}

View File

@ -37,8 +37,8 @@ jobs:
needs: get-default-label-prefix
with:
runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm86
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11-inductor-benchmarks
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.0;8.6'
test-matrix: |
{ include: [
@ -76,7 +76,7 @@ jobs:
uses: ./.github/workflows/_linux-test.yml
needs: periodic-dynamo-benchmarks-build
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm86
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86
docker-image: ${{ needs.periodic-dynamo-benchmarks-build.outputs.docker-image }}
test-matrix: ${{ needs.periodic-dynamo-benchmarks-build.outputs.test-matrix }}
secrets: inherit
@ -138,8 +138,8 @@ jobs:
- get-default-label-prefix
with:
runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm80
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11-inductor-benchmarks
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.0'
test-matrix: |
{ include: [
@ -153,7 +153,7 @@ jobs:
uses: ./.github/workflows/_linux-test.yml
needs: inductor-smoke-build
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm80
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
docker-image: ${{ needs.inductor-smoke-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-smoke-build.outputs.test-matrix }}
secrets: inherit

View File

@ -33,8 +33,8 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm86
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11-inductor-benchmarks
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.6'
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
test-matrix: |
@ -52,7 +52,7 @@ jobs:
uses: ./.github/workflows/_linux-test.yml
needs: inductor-build
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm86
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86
docker-image: ${{ needs.inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-build.outputs.test-matrix }}
secrets: inherit

View File

@ -49,8 +49,8 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm86
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11-inductor-benchmarks
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.6'
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
test-matrix: |
@ -69,7 +69,7 @@ jobs:
uses: ./.github/workflows/_linux-test.yml
needs: inductor-build
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm86
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86
docker-image: ${{ needs.inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-build.outputs.test-matrix }}
secrets: inherit

View File

@ -25,7 +25,7 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
with:
runner: linux.12xlarge.memory
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm80
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
cuda-arch-list: '8.0 9.0'
test-matrix: |
@ -41,7 +41,7 @@ jobs:
needs: opmicrobenchmark-build
with:
timeout-minutes: 500
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm80
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
docker-image: ${{ needs.opmicrobenchmark-build.outputs.docker-image }}
test-matrix: ${{ needs.opmicrobenchmark-build.outputs.test-matrix }}
secrets: inherit
@ -53,7 +53,7 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
with:
runner: linux.12xlarge.memory
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
cuda-arch-list: '10.0'
test-matrix: |
@ -68,7 +68,7 @@ jobs:
needs: opmicrobenchmark-build-b200
with:
timeout-minutes: 500
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100
docker-image: ${{ needs.opmicrobenchmark-build-b200.outputs.docker-image }}
test-matrix: ${{ needs.opmicrobenchmark-build-b200.outputs.test-matrix }}
aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only

View File

@ -90,7 +90,6 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-cuda12.8-py3.10-gcc11
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
cuda-arch-list: 8.6
test-matrix: |
{ include: [
{ config: "nogpu_AVX512", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
@ -98,9 +97,7 @@ jobs:
{ config: "nogpu_AVX512", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
{ config: "nogpu_NO_AVX2", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
{ config: "nogpu_NO_AVX2", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
{ config: "jit_legacy", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
{ config: "multigpu", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.12xlarge.nvidia.gpu", owners: ["oncall:distributed"] },
{ config: "multigpu", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.12xlarge.nvidia.gpu", owners: ["oncall:distributed"] },
{ config: "jit_legacy", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
]}
secrets: inherit
@ -116,14 +113,40 @@ jobs:
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-cuda12_8-py3_10-gcc11-debug-build:
name: linux-jammy-cuda12.8-py3.10-gcc11-debug
linux-jammy-cuda12_8-py3_10-gcc9-build:
name: linux-jammy-cuda12.8-py3.10-gcc9
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-debug
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
build-environment: linux-jammy-cuda12.8-py3.10-gcc9
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9
cuda-arch-list: 8.6
test-matrix: |
{ include: [
{ config: "multigpu", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.12xlarge.nvidia.gpu", owners: ["oncall:distributed"] },
{ config: "multigpu", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.12xlarge.nvidia.gpu", owners: ["oncall:distributed"] },
]}
secrets: inherit
linux-jammy-cuda12_8-py3_10-gcc9-test:
name: linux-jammy-cuda12.8-py3.10-gcc9
uses: ./.github/workflows/_linux-test.yml
needs: linux-jammy-cuda12_8-py3_10-gcc9-build
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc9
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-cuda12_8-py3_10-gcc9-debug-build:
name: linux-jammy-cuda12.8-py3.10-gcc9-debug
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-debug
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9
cuda-arch-list: 8.9
test-matrix: |
{ include: [
@ -137,16 +160,16 @@ jobs:
]}
secrets: inherit
linux-jammy-cuda12_8-py3_10-gcc11-debug-test:
name: linux-jammy-cuda12.8-py3.10-gcc11-debug
linux-jammy-cuda12_8-py3_10-gcc9-debug-test:
name: linux-jammy-cuda12.8-py3.10-gcc9-debug
uses: ./.github/workflows/_linux-test.yml
needs:
- linux-jammy-cuda12_8-py3_10-gcc11-debug-build
- linux-jammy-cuda12_8-py3_10-gcc9-debug-build
- target-determination
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-debug
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-debug-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-debug-build.outputs.test-matrix }}
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-debug
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-debug-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-debug-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-cuda13_0-py3_10-gcc11-build:

View File

@ -318,14 +318,14 @@ jobs:
]}
secrets: inherit
linux-jammy-cuda12_8-py3_10-gcc11-inductor-build:
name: cuda12.8-py3.10-gcc11-sm75
linux-jammy-cuda12_8-py3_10-gcc9-inductor-build:
name: cuda12.8-py3.10-gcc9-sm75
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm75
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11-inductor-benchmarks
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm75
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '7.5'
test-matrix: |
{ include: [
@ -333,14 +333,14 @@ jobs:
]}
secrets: inherit
linux-jammy-cuda12_8-py3_10-gcc11-inductor-test:
name: cuda12.8-py3.10-gcc11-sm75
linux-jammy-cuda12_8-py3_10-gcc9-inductor-test:
name: cuda12.8-py3.10-gcc9-sm75
uses: ./.github/workflows/_linux-test.yml
needs: linux-jammy-cuda12_8-py3_10-gcc11-inductor-build
needs: linux-jammy-cuda12_8-py3_10-gcc9-inductor-build
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm75
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-inductor-build.outputs.test-matrix }}
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm75
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.test-matrix }}
secrets: inherit
linux-noble-xpu-n-py3_10-build:

View File

@ -26,14 +26,14 @@ jobs:
curr_ref_type: ${{ github.ref_type }}
build:
name: cuda12.8-py3.10-gcc11-sm80
name: cuda12.8-py3.10-gcc9-sm80
uses: ./.github/workflows/_linux-build.yml
needs:
- get-default-label-prefix
with:
runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm80
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11-inductor-benchmarks
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.0'
test-matrix: |
{ include: [
@ -42,11 +42,11 @@ jobs:
secrets: inherit
test:
name: cuda12.8-py3.10-gcc11-sm80
name: cuda12.8-py3.10-gcc9-sm80
uses: ./.github/workflows/_linux-test.yml
needs: build
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm80
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
docker-image: ${{ needs.build.outputs.docker-image }}
test-matrix: ${{ needs.build.outputs.test-matrix }}
secrets: inherit

View File

@ -231,8 +231,8 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
build-environment: linux-jammy-cuda12.8-py3.12-gcc11-sm80
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11-inductor-benchmarks
build-environment: linux-jammy-cuda12.8-py3.12-gcc9-sm80
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.0'
secrets: inherit

View File

@ -813,43 +813,8 @@ void smooth_l1_kernel(TensorIteratorBase& iter, double beta) {
}
void huber_kernel(TensorIterator& iter, double delta) {
// Special-case kHalf: compute in float for numerical stability
if (iter.dtype() == kHalf) {
const float delta_val(static_cast<float>(delta));
const Vectorized<float> delta_vec(static_cast<float>(delta));
const Vectorized<float> point_five_vec(static_cast<float>(0.5));
cpu_kernel_vec(
iter,
// scalar lambda: convert half -> float, compute in float, cast back to half
[&delta_val] (at::Half a, at::Half b) -> at::Half {
float af = static_cast<float>(a);
float bf = static_cast<float>(b);
float z = std::abs(af - bf);
float out = z < delta_val
? 0.5f * z * z
: delta_val * (z - 0.5f * delta_val);
return static_cast<at::Half>(out);
},
[&delta_vec, &point_five_vec] (Vectorized<Half> a, Vectorized<Half> b) {
auto [a0, a1] = convert_half_float(a);
auto [b0, b1] = convert_half_float(b);
auto z = (a0 - b0).abs();
a0 = Vectorized<float>::blendv(
point_five_vec * z * z,
delta_vec * (z - point_five_vec * delta_vec),
z >= delta_vec);
z = (a1 - b1).abs();
a1 = Vectorized<float>::blendv(
point_five_vec * z * z,
delta_vec * (z - point_five_vec * delta_vec),
z >= delta_vec);
return convert_float_half(a0, a1);
}
);
return;
}
else {
AT_DISPATCH_FLOATING_TYPES_AND(kBFloat16, iter.dtype(), "huber_cpu", [&]() {
AT_DISPATCH_FLOATING_TYPES_AND2(
kBFloat16, kHalf, iter.dtype(), "huber_cpu", [&]() {
using Vec = Vectorized<scalar_t>;
const scalar_t delta_val(delta);
const Vec delta_val_vec(delta_val);
@ -870,7 +835,6 @@ void huber_kernel(TensorIterator& iter, double delta) {
z >= delta_val_vec);
});
});
}
}
void sigmoid_backward_kernel(TensorIteratorBase& iter) {

View File

@ -147,19 +147,6 @@ class MetalShaderLibrary {
const std::optional<c10::Scalar> alpha = std::nullopt,
const std::optional<c10::ScalarType> scalar_arg_type = std::nullopt);
template <typename T>
void exec_unary_kernel_with_params(
TensorIteratorBase& iter,
const std::string& name,
T params,
const std::string& params_type_name);
template <typename T>
void exec_binary_kernel_with_params(
TensorIteratorBase& iter,
const std::string& name,
T params,
const std::string& params_type_name);
protected:
virtual MTLLibrary_t getLibrary();
virtual MTLLibrary_t getLibrary(

View File

@ -7,12 +7,10 @@
#include <ATen/Tensor.h>
#include <ATen/TensorIterator.h>
#include <ATen/Utils.h>
#include <ATen/mps/MPSProfiler.h>
#include <ATen/mps/MPSStream.h>
#include <ATen/native/mps/MetalShaderLibrary.h>
#include <ATen/native/mps/TensorFactory.h>
#include <c10/core/ScalarType.h>
#include <fmt/format.h>
#include <torch/library.h>
#include <unordered_map>
@ -632,147 +630,4 @@ inline bool needsGather(const TensorBase& t) {
return !is_macOS_15_0_or_newer && (!t.is_contiguous() || t.storage_offset());
}
template <typename T>
void MetalShaderLibrary::exec_unary_kernel_with_params(TensorIteratorBase& iter,
const std::string& name,
T params,
const std::string& params_type_name) {
using namespace at::mps;
// Decompose 64-bit tensor into 32-bit ones
if (!iter.can_use_32bit_indexing()) {
for (auto&& sub_iter : iter.with_32bit_indexing()) {
exec_unary_kernel_with_params(sub_iter, name, params, params_type_name);
}
return;
}
auto inputTensor = iter.input(0);
auto outputTensor = iter.output(0);
uint32_t length = iter.numel();
if (length == 0) {
return;
}
auto kernel_name = fmt::format("{}_{}_{}_{}{}",
name,
iter.is_contiguous() ? "dense" : "strided",
scalarToMetalTypeString(outputTensor),
scalarToMetalTypeString(inputTensor),
fmt::format("_{}", params_type_name));
@autoreleasepool {
auto cplState = getPipelineStateForFunc(kernel_name);
MPSStream* mpsStream = getCurrentMPSStream();
dispatch_sync(mpsStream->queue(), ^() {
auto computeEncoder = mpsStream->commandEncoder();
getMPSProfiler().beginProfileKernel(cplState, name, {inputTensor});
[computeEncoder setComputePipelineState:cplState];
bind_iter_tensors(computeEncoder, iter);
if (!iter.is_contiguous()) {
mtl_setArgs<2>(computeEncoder,
outputTensor.sizes(),
inputTensor.strides(),
outputTensor.strides(),
inputTensor.ndimension());
}
detail::mtl_setArg(computeEncoder, params, iter.is_contiguous() ? 2 : 6);
mtl_dispatch1DJob(computeEncoder, cplState, length);
getMPSProfiler().endProfileKernel(cplState);
});
}
}
template <typename T>
void MetalShaderLibrary::exec_binary_kernel_with_params(TensorIteratorBase& iter,
const std::string& name,
T params,
const std::string& params_type_name) {
using namespace mps;
// TODO: Figure a better place to downcast double scalars (probably in tensor iterator itself?)
// Right now running something like 1.0-torch.rand(5, device='mps') will create iterator with
// double as common dtype (because Python floating point are always 64-bit values)
TORCH_CHECK(iter.output().scalar_type() != at::kDouble, "float64 is not supported on MPS");
// Skip for empty iterators
if (iter.numel() == 0) {
return;
}
// Decompose 64-bit tensor into 32-bit ones
if (!iter.can_use_32bit_indexing()) {
for (auto&& sub_iter : iter.with_32bit_indexing()) {
exec_binary_kernel_with_params(sub_iter, name, params, params_type_name);
}
return;
}
auto convert_double_scalar = [](Tensor& t) {
if (t.dim() != 0) {
return;
}
if (t.scalar_type() == kDouble) {
t = t.to(kFloat);
} else if (t.scalar_type() == kComplexDouble) {
t = t.to(kComplexFloat);
}
};
Tensor input = iter.input(0);
Tensor other = iter.input(1);
Tensor out = iter.output();
convert_double_scalar(input);
convert_double_scalar(other);
MPSStream* mpsStream = getCurrentMPSStream();
const auto cast_needed = input.scalar_type() != other.scalar_type();
const auto suffix = iter.is_contiguous() ? "dense" : "strided";
// TODO: Implicitly pass both input and output types to non-cast kernels
const auto kernel_name = cast_needed
? fmt::format("{}_{}_cast_{}_{}", name, suffix, scalarToMetalTypeString(out), params_type_name)
: fmt::format("{}_{}_{}_{}_{}",
name,
suffix,
scalarToMetalTypeString(out),
scalarToMetalTypeString(input),
params_type_name);
dispatch_sync_with_rethrow(mpsStream->queue(), ^() {
@autoreleasepool {
auto computeEncoder = mpsStream->commandEncoder();
auto binaryPSO = getPipelineStateForFunc(kernel_name);
// this function call is a no-op if MPS Profiler is not enabled
getMPSProfiler().beginProfileKernel(binaryPSO, kernel_name, {input, other});
[computeEncoder setComputePipelineState:binaryPSO];
// Set input and output tensors
bind_iter_tensors(computeEncoder, iter);
// Iterator is contiguous if all of its elements are dense in storage,
// i.e. it's true for both row-first and column-first tensors
if (iter.is_contiguous()) {
detail::mtl_setArg(computeEncoder, params, 3);
if (cast_needed) {
std::array<int, 4> size_and_types = {static_cast<int>(c10::elementSize(input.scalar_type())),
static_cast<int>(c10::elementSize(other.scalar_type())),
static_cast<int>(input.scalar_type()),
static_cast<int>(other.scalar_type())};
mtl_setBytes(computeEncoder, size_and_types, 4);
}
} else {
// Please note that shapes and strides of the iterator might be
// different than that of its operands, for example binary op
// between 4x4 tensor and scalar will result in 1D 16 element iterator
std::array<int, 4> ndim_and_types = {iter.ndim(),
static_cast<int>(input.scalar_type()),
static_cast<int>(other.scalar_type()),
static_cast<int>(out.scalar_type())};
mtl_setArgs<3>(
computeEncoder, params, iter.shape(), iter.strides(0), iter.strides(1), iter.strides(2), ndim_and_types);
}
mtl_dispatch1DJob(computeEncoder, binaryPSO, iter.numel());
getMPSProfiler().endProfileKernel(binaryPSO);
}
});
}
} // namespace at::native::mps

View File

@ -1,16 +0,0 @@
#pragma once
template <typename T>
struct ELUParams {
T alpha;
T scale;
T input_scale;
};
template <typename T>
struct ELUBackwardParams {
T alpha;
T scale;
T input_scale;
bool is_result;
};

View File

@ -1,4 +1,3 @@
#include <ATen/native/mps/kernels/Activation.h>
#include <c10/metal/indexing.h>
#include <c10/metal/special_math.h>
#include <metal_stdlib>
@ -100,59 +99,6 @@ REGISTER_BINARY_OP(hardswish_backward, float, float);
REGISTER_BINARY_OP(hardswish_backward, half, half);
REGISTER_BINARY_OP(hardswish_backward, bfloat, bfloat);
struct elu_functor {
template <typename T>
inline T operator()(const T self_, const ELUParams<T> params) {
using op_T = opmath_t<T>;
auto alpha = static_cast<op_T>(params.alpha);
auto scale = static_cast<op_T>(params.scale);
auto input_scale = static_cast<op_T>(params.input_scale);
auto self = static_cast<op_T>(self_);
auto neg_res = alpha * (::metal::precise::exp(self * input_scale) - 1);
return static_cast<T>(scale * (self < 0 ? neg_res : self));
}
};
struct elu_backward_functor {
template <typename T>
inline T operator()(
const T grad_output_,
const T self_,
ELUBackwardParams<T> params) {
using op_T = opmath_t<T>;
auto alpha = static_cast<op_T>(params.alpha);
auto scale = static_cast<op_T>(params.scale);
auto input_scale = static_cast<op_T>(params.input_scale);
auto grad_output = static_cast<op_T>(grad_output_);
auto self = static_cast<op_T>(self_);
if (params.is_result) {
auto neg_coef = input_scale * (self + alpha * scale);
return static_cast<T>(grad_output * (self <= 0 ? neg_coef : scale));
} else {
auto neg_coef = input_scale * alpha * scale *
::metal::precise::exp(self * input_scale);
return static_cast<T>(grad_output * (self <= 0 ? neg_coef : scale));
}
}
};
#define REGISTER_ELU_OP(T) \
typedef ELUParams<T> ELUParams_##T; \
REGISTER_UNARY_ALPHA_OP(elu, T, ELUParams_##T, T);
REGISTER_ELU_OP(float);
REGISTER_ELU_OP(half);
REGISTER_ELU_OP(bfloat);
#define REGISTER_ELU_BACKWARD_OP(T) \
typedef ELUBackwardParams<T> ELUBackwardParams_##T; \
REGISTER_BINARY_ALPHA_OP(elu_backward, T, ELUBackwardParams_##T, T);
REGISTER_ELU_BACKWARD_OP(float);
REGISTER_ELU_BACKWARD_OP(half);
REGISTER_ELU_BACKWARD_OP(bfloat);
struct leaky_relu_functor {
template <typename T>
inline T operator()(const T x, const T negative_slope) {

View File

@ -11,6 +11,8 @@
#include <ATen/ops/_log_softmax_native.h>
#include <ATen/ops/_prelu_kernel_backward_native.h>
#include <ATen/ops/_prelu_kernel_native.h>
#include <ATen/ops/elu_backward_native.h>
#include <ATen/ops/elu_native.h>
#include <ATen/ops/gelu_backward_native.h>
#include <ATen/ops/gelu_native.h>
#include <ATen/ops/glu_backward_native.h>
@ -696,6 +698,194 @@ TORCH_IMPL_FUNC(gelu_backward_out_mps)
}
}
static void elu_variants_out_mps(const Tensor& self,
const Scalar& alpha,
const Scalar& scale,
const Scalar& input_scale,
const Tensor& result,
std::string func_name) {
using namespace mps;
using CachedGraph = MPSUnaryCachedGraph;
auto resultMemFormat = result.suggest_memory_format();
bool executeGatherOp = !(self.is_contiguous(resultMemFormat) && result.is_contiguous(resultMemFormat));
Tensor out;
if (executeGatherOp) {
out = at::empty_like(result, MemoryFormat::Contiguous);
}
// Empty output
if (result.numel() == 0) {
return;
}
MPSStream* stream = getCurrentMPSStream();
@autoreleasepool {
std::string key = func_name + ":" + getTensorsStringKey({self}) + ":" + std::to_string(alpha.to<double>()) + ":" +
std::to_string(scale.to<double>()) + ":" + std::to_string(input_scale.to<double>());
auto cachedGraph = LookUpOrCreateCachedGraph<CachedGraph>(key, [&](auto mpsGraph, auto newCachedGraph) {
MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, self);
// scale * (max(0, x) + min(0, alpha * (exp(input_scale * x) - 1) ))
MPSGraphTensor* alphaTensor = [mpsGraph constantWithScalar:alpha.to<double>()
shape:@[ @1 ]
dataType:getMPSDataType(self)];
MPSGraphTensor* inputScaleTensor = [mpsGraph constantWithScalar:input_scale.to<double>()
shape:@[ @1 ]
dataType:getMPSDataType(self)];
MPSGraphTensor* scaleTensor = [mpsGraph constantWithScalar:scale.to<double>()
shape:@[ @1 ]
dataType:getMPSDataType(self)];
MPSGraphTensor* unitTensor = [mpsGraph constantWithScalar:1.0f shape:@[ @1 ] dataType:getMPSDataType(self)];
MPSGraphTensor* zeroTensor = [mpsGraph constantWithScalar:0.0f shape:@[ @1 ] dataType:getMPSDataType(self)];
MPSGraphTensor* scaledInputTensor = [mpsGraph multiplicationWithPrimaryTensor:inputTensor
secondaryTensor:inputScaleTensor
name:nil];
MPSGraphTensor* exponentTensor = [mpsGraph exponentWithTensor:scaledInputTensor name:nil];
MPSGraphTensor* exponentMinusOneTensor = [mpsGraph subtractionWithPrimaryTensor:exponentTensor
secondaryTensor:unitTensor
name:nil];
MPSGraphTensor* alphaTimesTensor = [mpsGraph multiplicationWithPrimaryTensor:exponentMinusOneTensor
secondaryTensor:alphaTensor
name:nil];
MPSGraphTensor* predicateTensor = [mpsGraph greaterThanWithPrimaryTensor:inputTensor
secondaryTensor:zeroTensor
name:nil];
MPSGraphTensor* fusedOutput = [mpsGraph selectWithPredicateTensor:predicateTensor
truePredicateTensor:inputTensor
falsePredicateTensor:alphaTimesTensor
name:nil];
MPSGraphTensor* outputTensor = [mpsGraph multiplicationWithPrimaryTensor:fusedOutput
secondaryTensor:scaleTensor
name:nil];
newCachedGraph->inputTensor_ = inputTensor;
newCachedGraph->outputTensor_ = outputTensor;
});
auto selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self, nil, executeGatherOp);
auto outputPlaceholder = Placeholder(cachedGraph->outputTensor_, out.has_storage() ? out : result, nil, false);
auto feeds = dictionaryFromPlaceholders(selfPlaceholder);
runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder);
if (out.has_storage()) {
result.copy_(out);
}
}
}
// scale * (max(0, x) + min(0, alpha * (exp(input_scale * x) - 1) ))
TORCH_IMPL_FUNC(elu_out_mps)
(const Tensor& self, const Scalar& alpha, const Scalar& scale, const Scalar& input_scale, const Tensor& result) {
elu_variants_out_mps(self, alpha, scale, input_scale, result, "elu_out_mps");
}
TORCH_IMPL_FUNC(elu_backward_out_mps)
(const Tensor& grad_output,
const Scalar& alpha,
const Scalar& scale,
const Scalar& input_scale,
bool is_result,
const Tensor& self_or_result,
const Tensor& grad_input) {
using namespace mps;
using CachedGraph = MPSUnaryGradCachedGraph;
auto gradMemFormat = grad_input.suggest_memory_format();
bool executeGatherOp = !(grad_output.is_contiguous(gradMemFormat) && self_or_result.is_contiguous(gradMemFormat) &&
grad_input.is_contiguous(gradMemFormat));
Tensor out;
if (executeGatherOp && gradMemFormat == MemoryFormat::ChannelsLast) {
out = at::empty_like(grad_input, MemoryFormat::Contiguous);
}
// Empty output
if (grad_input.numel() == 0) {
return;
}
MPSStream* stream = getCurrentMPSStream();
@autoreleasepool {
std::string key = "elu_backward_out_mps:" + getTensorsStringKey({grad_output, self_or_result}) + ":" +
std::to_string(alpha.to<double>()) + ":" + std::to_string(scale.to<double>()) + ":" +
std::to_string(input_scale.to<double>()) + ":" + std::to_string(is_result);
auto cachedGraph = LookUpOrCreateCachedGraph<CachedGraph>(key, [&](auto mpsGraph, auto newCachedGraph) {
MPSGraphTensor* gradOutputTensor = mpsGraphRankedPlaceHolder(mpsGraph, grad_output);
MPSGraphTensor* selfOrResultTensor = mpsGraphRankedPlaceHolder(mpsGraph, self_or_result);
MPSGraphTensor* lessThanZeroGradTensor = nil;
if (is_result) {
MPSGraphTensor* alphaTensor = [mpsGraph constantWithScalar:alpha.to<double>()
shape:@[ @1 ]
dataType:getMPSDataType(grad_output)];
MPSGraphTensor* resultPlusAlphaTensor = [mpsGraph additionWithPrimaryTensor:selfOrResultTensor
secondaryTensor:alphaTensor
name:nil];
auto constMul = scale.to<double>() * input_scale.to<double>();
MPSGraphTensor* constMulTensor = [mpsGraph constantWithScalar:constMul
shape:@[ @1 ]
dataType:getMPSDataType(grad_output)];
lessThanZeroGradTensor = [mpsGraph multiplicationWithPrimaryTensor:resultPlusAlphaTensor
secondaryTensor:constMulTensor
name:nil];
} else {
MPSGraphTensor* inputScaleTensor = [mpsGraph constantWithScalar:input_scale.to<double>()
shape:@[ @1 ]
dataType:getMPSDataType(grad_output)];
MPSGraphTensor* scaledInputTensor = [mpsGraph multiplicationWithPrimaryTensor:selfOrResultTensor
secondaryTensor:inputScaleTensor
name:nil];
MPSGraphTensor* expTensor = [mpsGraph exponentWithTensor:scaledInputTensor name:nil];
auto constMul = scale.to<double>() * input_scale.to<double>() * alpha.to<double>();
MPSGraphTensor* constMulTensor = [mpsGraph constantWithScalar:constMul
shape:@[ @1 ]
dataType:getMPSDataType(grad_output)];
lessThanZeroGradTensor = [mpsGraph multiplicationWithPrimaryTensor:expTensor
secondaryTensor:constMulTensor
name:nil];
}
MPSGraphTensor* scaleTensor = [mpsGraph constantWithScalar:scale.to<double>()
shape:@[ @1 ]
dataType:getMPSDataType(grad_output)];
MPSGraphTensor* zeroTensor = [mpsGraph constantWithScalar:0.0f
shape:@[ @1 ]
dataType:getMPSDataType(grad_output)];
MPSGraphTensor* predicateTensor = [mpsGraph greaterThanWithPrimaryTensor:selfOrResultTensor
secondaryTensor:zeroTensor
name:nil];
MPSGraphTensor* gradTensor = [mpsGraph selectWithPredicateTensor:predicateTensor
truePredicateTensor:scaleTensor
falsePredicateTensor:lessThanZeroGradTensor
name:nil];
MPSGraphTensor* gradInputTensor = [mpsGraph multiplicationWithPrimaryTensor:gradTensor
secondaryTensor:gradOutputTensor
name:nil];
newCachedGraph->gradOutputTensor_ = gradOutputTensor;
newCachedGraph->inputTensor_ = selfOrResultTensor;
newCachedGraph->gradInputTensor_ = gradInputTensor;
});
Placeholder gradOutputPlaceholder = Placeholder(cachedGraph->gradOutputTensor_, grad_output, nil, executeGatherOp);
Placeholder selfOrResultPlaceholder = Placeholder(cachedGraph->inputTensor_, self_or_result, nil, executeGatherOp);
Placeholder gradInputPlaceholder =
Placeholder(cachedGraph->gradInputTensor_, out.has_storage() ? out : grad_input, nil, false);
auto feeds = dictionaryFromPlaceholders(gradOutputPlaceholder, selfOrResultPlaceholder);
runMPSGraph(stream, cachedGraph->graph(), feeds, gradInputPlaceholder);
if (out.has_storage()) {
grad_input.copy_(out);
}
}
}
TORCH_IMPL_FUNC(glu_out_mps)(const Tensor& self, const int64_t dim, const Tensor& output) {
using namespace mps;
using CachedGraph = MPSUnaryCachedGraph;

View File

@ -1,10 +1,8 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/Dispatch.h>
#include <ATen/TensorIterator.h>
#include <ATen/mps/MPSProfiler.h>
#include <ATen/native/Activation.h>
#include <ATen/native/mps/OperationUtils.h>
#include <ATen/native/mps/kernels/Activation.h>
#include <fmt/format.h>
namespace at::native {
@ -43,30 +41,6 @@ static void hardswish_backward_kernel(at::TensorIterator& iter) {
lib.exec_binary_kernel(iter, "hardswish_backward");
}
static void elu_kernel(TensorIteratorBase& iter, const Scalar& alpha, const Scalar& scale, const Scalar& input_scale) {
AT_DISPATCH_FLOATING_TYPES_AND2(c10::kHalf, c10::kBFloat16, iter.common_dtype(), "elu_mps", [&]() {
ELUParams<scalar_t> params{alpha.to<scalar_t>(), scale.to<scalar_t>(), input_scale.to<scalar_t>()};
lib.exec_unary_kernel_with_params(
iter, "elu", params, fmt::format("ELUParams_{}", mps::scalarToMetalTypeString(iter.common_dtype())));
});
}
static void elu_backward_kernel(TensorIteratorBase& iter,
const Scalar& alpha,
const Scalar& scale,
const Scalar& input_scale,
bool is_result) {
AT_DISPATCH_FLOATING_TYPES_AND2(c10::kHalf, c10::kBFloat16, iter.common_dtype(), "elu_backward_mps", [&]() {
ELUBackwardParams<scalar_t> params{
alpha.to<scalar_t>(), scale.to<scalar_t>(), input_scale.to<scalar_t>(), is_result};
lib.exec_binary_kernel_with_params(
iter,
"elu_backward",
params,
fmt::format("ELUBackwardParams_{}", mps::scalarToMetalTypeString(iter.common_dtype())));
});
}
static void leaky_relu_kernel(TensorIteratorBase& iter, const Scalar& negative_slope) {
lib.exec_unary_kernel(iter, "leaky_relu", negative_slope);
}
@ -82,8 +56,6 @@ REGISTER_DISPATCH(hardsigmoid_stub, hardsigmoid_kernel);
REGISTER_DISPATCH(hardsigmoid_backward_stub, hardsigmoid_backward_kernel);
REGISTER_DISPATCH(hardswish_stub, hardswish_kernel);
REGISTER_DISPATCH(hardswish_backward_stub, hardswish_backward_kernel);
REGISTER_DISPATCH(elu_stub, elu_kernel);
REGISTER_DISPATCH(elu_backward_stub, elu_backward_kernel);
REGISTER_DISPATCH(leaky_relu_stub, leaky_relu_kernel);
REGISTER_DISPATCH(leaky_relu_backward_stub, leaky_relu_backward_kernel);

View File

@ -12064,7 +12064,8 @@
device_check: NoCheck # TensorIterator
python_module: nn
dispatch:
CPU, CUDA, MPS: elu_out
CPU, CUDA: elu_out
MPS: elu_out_mps
- func: elu(Tensor self, Scalar alpha=1, Scalar scale=1, Scalar input_scale=1) -> Tensor
structured_delegate: elu.out
@ -12077,7 +12078,8 @@
structured_inherits: TensorIteratorBase
python_module: nn
dispatch:
CPU, CUDA, MPS: elu_backward_out
CPU, CUDA: elu_backward_out
MPS: elu_backward_out_mps
- func: elu_backward(Tensor grad_output, Scalar alpha, Scalar scale, Scalar input_scale, bool is_result, Tensor self_or_result) -> Tensor
structured_delegate: elu_backward.grad_input

View File

@ -8184,6 +8184,130 @@ class ReproTestsDevice(torch._dynamo.test_case.TestCase):
self.assertEqual(fn(torch.ones(3)), torch.ones(3) + 1)
def test_pytree_get_node_type_not_traced(self):
# Test that torch.utils._pytree._get_node_type is not traced into
# and doesn't cause excessive trace time overhead
from torch.utils._pytree import _get_node_type
cnt = torch._dynamo.testing.CompileCounter()
@torch.compile(backend=cnt, fullgraph=True)
def fn(x, y):
# Call _get_node_type which is used internally by pytree operations
node_type = _get_node_type([x, y])
assert node_type is list
# Do some work with pytree structures
data = {"a": x, "b": y}
flat, spec = pytree.tree_flatten(data)
result = flat[0] + flat[1]
return result
x = torch.randn(3, 4)
y = torch.randn(3, 4)
result = fn(x, y)
expected = x + y
self.assertTrue(torch.allclose(result, expected))
# Should compile successfully with fullgraph=True
self.assertEqual(cnt.frame_count, 1)
def test_pytree_get_node_type_with_namedtuple(self):
# Test that torch.utils._pytree._get_node_type handles namedtuples correctly
# without being traced into, even when is_namedtuple_class is True
from collections import namedtuple
from torch.utils._pytree import _get_node_type
Point = namedtuple("Point", ["x", "y"])
cnt = torch._dynamo.testing.CompileCounter()
@torch.compile(backend=cnt, fullgraph=True)
def fn(a, b):
# Create a namedtuple
point = Point(a, b)
# Call _get_node_type with a namedtuple instance
node_type = _get_node_type(point)
assert node_type is namedtuple
# Use pytree operations with namedtuples
flat, spec = pytree.tree_flatten(point)
result = flat[0] + flat[1]
return result
x = torch.randn(3, 4)
y = torch.randn(3, 4)
result = fn(x, y)
expected = x + y
self.assertTrue(torch.allclose(result, expected))
# Should compile successfully with fullgraph=True
self.assertEqual(cnt.frame_count, 1)
def test_pytree_tree_is_leaf_not_traced(self):
# Test that torch.utils._pytree.tree_is_leaf is not traced into
# when is_leaf parameter is None (the common case)
from torch.utils._pytree import tree_is_leaf
cnt = torch._dynamo.testing.CompileCounter()
@torch.compile(backend=cnt, fullgraph=True)
def fn(x, y):
# Test with various types
# Tensors are leaves
is_leaf_tensor = tree_is_leaf(x)
assert is_leaf_tensor is True
# Lists are not leaves (they're in SUPPORTED_NODES)
is_leaf_list = tree_is_leaf([x, y])
assert is_leaf_list is False
# Dicts are not leaves
is_leaf_dict = tree_is_leaf({"a": x, "b": y})
assert is_leaf_dict is False
return x + y
x = torch.randn(3, 4)
y = torch.randn(3, 4)
result = fn(x, y)
expected = x + y
self.assertTrue(torch.allclose(result, expected))
# Should compile successfully with fullgraph=True
self.assertEqual(cnt.frame_count, 1)
def test_pytree_tree_is_leaf_with_namedtuple(self):
# Test that torch.utils._pytree.tree_is_leaf handles namedtuples correctly
from collections import namedtuple
from torch.utils._pytree import tree_is_leaf
Point = namedtuple("Point", ["x", "y"])
cnt = torch._dynamo.testing.CompileCounter()
@torch.compile(backend=cnt, fullgraph=True)
def fn(a, b):
# Namedtuples are not leaves (they're in SUPPORTED_NODES)
point = Point(a, b)
is_leaf_namedtuple = tree_is_leaf(point)
assert is_leaf_namedtuple is False
# But individual tensors are leaves
is_leaf_tensor = tree_is_leaf(a)
assert is_leaf_tensor is True
return a + b
x = torch.randn(3, 4)
y = torch.randn(3, 4)
result = fn(x, y)
expected = x + y
self.assertTrue(torch.allclose(result, expected))
# Should compile successfully with fullgraph=True
self.assertEqual(cnt.frame_count, 1)
instantiate_parametrized_tests(ReproTests)

View File

@ -828,6 +828,9 @@ inductor_one_sample["cuda"] = {
"nn.functional.fractional_max_pool3d": {f16, f32, f64},
"nn.functional.group_norm": {f16},
"nn.functional.hinge_embedding_loss": {f16},
# Enabling all sample inputs for this test fails randomly
# See https://github.com/pytorch/pytorch/issues/129238
"nn.functional.huber_loss": {f16},
"nn.functional.interpolate.bicubic": {f16},
"nn.functional.interpolate.bilinear": {f16},
"nn.functional.interpolate.trilinear": {f16},
@ -945,6 +948,9 @@ inductor_one_sample["xpu"] = {
"nn.functional.fractional_max_pool3d": {f16, f32, f64},
"nn.functional.group_norm": {f16},
"nn.functional.hinge_embedding_loss": {f16},
# Enabling all sample inputs for this test fails randomly
# See https://github.com/pytorch/pytorch/issues/129238
"nn.functional.huber_loss": {f16},
"nn.functional.interpolate.bicubic": {f16},
"nn.functional.interpolate.bilinear": {f16},
"nn.functional.interpolate.trilinear": {f16},

View File

@ -421,7 +421,7 @@ RESET_GRAD_ACCUMULATOR = {"set_", "resize_"}
# inplace or out-variants)
# If the function does not modify its arguments, we also check the following properties
# pertaining to its output:
# 2) Its TensorImpl has use_count of 1 (or 2 if it has a PyObject)
# 2) Its TensorImpl has use_count of 1
# 3) If the function is a view function, it has the same StorageImpl as that of
# the input it is aliased with. Otherwise, its StorageImpl has use_count of 1
#
@ -496,10 +496,10 @@ if (${tensor_name}_impl_saved && !at::impl::dispatch_mode_enabled() && !at::impl
"""
)
ENFORCE_TENSOR_IMPL_USE_COUNT = CodeTemplate(
ENFORCE_TENSOR_IMPL_USE_COUNT_LT_OR_EQ_ONE = CodeTemplate(
"""\
if (!at::impl::dispatch_mode_enabled() && !at::impl::tensor_has_dispatch(${tensor_name}))
TORCH_INTERNAL_ASSERT(${tensor_name}.use_count() == expected_fresh_use_count(${tensor_name}), "function: ${fn_name}");
TORCH_INTERNAL_ASSERT(${tensor_name}.use_count() <= 1, "function: ${fn_name}");
"""
)
@ -1664,7 +1664,7 @@ def emit_body(
if type_wrapper_name(f) not in DONT_ENFORCE_TENSOR_IMPL_USE_COUNT:
stmts_after_call += [
ENFORCE_TENSOR_IMPL_USE_COUNT.substitute(
ENFORCE_TENSOR_IMPL_USE_COUNT_LT_OR_EQ_ONE.substitute(
tensor_name=ret_name, fn_name=type_wrapper_name(f)
)
]

View File

@ -47,18 +47,6 @@ namespace{
meta->grad_accumulator_.reset();
}
}
[[maybe_unused]] size_t expected_fresh_use_count(const Variable& self) {
if (!self.defined()) {
// An UndefinedTensorImpl always has a use count of 0
return 0;
}
if (self.unsafeGetTensorImpl()->pyobj_slot()->load_pyobj() != nullptr) {
// A TensorImpl with a Python object has a use count of 2
return 2;
}
// A fresh TensorImpl (with no PyObject) has a use count of 1
return 1;
}
}
namespace {

View File

@ -64,6 +64,8 @@ from .variables import (
LocalGeneratorObjectVariable,
NestedUserFunctionVariable,
PolyfilledFunctionVariable,
PyTreeGetNodeTypeFunctionVariable,
PyTreeTreeIsLeafFunctionVariable,
ReparametrizeModuleCallVariable,
SkipFunctionVariable,
TorchInGraphFunctionVariable,
@ -378,6 +380,8 @@ manual_torch_name_rule_map: dict[
f"torch/testing/_internal/distributed/_tensor/common_dtensor.py#{TORCH_DYNAMO_RESUME_IN_PREFIX}": UserFunctionVariable,
"torch/testing/_internal/common_distributed.py#forward": UserFunctionVariable,
f"torch/testing/_internal/common_distributed.py#{TORCH_DYNAMO_RESUME_IN_PREFIX}": UserFunctionVariable,
"torch.utils._pytree._get_node_type": PyTreeGetNodeTypeFunctionVariable,
"torch.utils._pytree.tree_is_leaf": PyTreeTreeIsLeafFunctionVariable,
}

View File

@ -64,6 +64,8 @@ from .functions import (
LocalGeneratorObjectVariable,
NestedUserFunctionVariable,
PolyfilledFunctionVariable,
PyTreeGetNodeTypeFunctionVariable,
PyTreeTreeIsLeafFunctionVariable,
SkipFunctionVariable,
TMADescriptorExperimentalVariable,
TMADescriptorStableVariable,

View File

@ -29,6 +29,7 @@ import logging
import sys
import traceback
import types
from collections import namedtuple
from collections.abc import Callable, Sequence
from types import CellType, FunctionType
from typing import Any, Optional, TYPE_CHECKING, TypeVar
@ -38,6 +39,7 @@ from weakref import WeakKeyDictionary
import torch
from torch._dynamo.exc import get_stack_above_dynamo
from torch._guards import Source
from torch.utils._pytree import is_namedtuple_class
from .. import config, graph_break_hints, polyfills, variables
from ..bytecode_transformation import create_call_function, create_rot_n, is_generator
@ -63,6 +65,8 @@ from ..source import (
DefaultsSource,
GetItemSource,
SkipGuardSource,
TorchSource,
TypeSource,
)
from ..utils import (
check_constant_args,
@ -115,6 +119,13 @@ CO_VARKEYWORDS = 0x08
_spec_cache: WeakKeyDictionary[Any, Any] = WeakKeyDictionary()
@functools.lru_cache
def get_pytree_SUPPORTED_NODES_source():
return AttrSource(
AttrSource(AttrSource(TorchSource(), "utils"), "_pytree"), "SUPPORTED_NODES"
)
class FunctionSpec:
def __init__(self, func: FunctionType):
code = func.__code__
@ -2717,3 +2728,96 @@ class CreateTMADescriptorStableVariable(VariableTracker):
tensor=tensor, # type: ignore[arg-type]
block_shape=block_shape, # type: ignore[arg-type]
)
class PyTreeGetNodeTypeFunctionVariable(UserFunctionVariable):
"""
`torch.utils._pytree._get_node_type` is a very hot function. We special-case it to reduce Dynamo tracing time.
def _get_node_type(tree: Any) -> Any:
node_type = type(tree)
# All namedtuple types are implicitly registered as pytree nodes.
# XXX: Other parts of the codebase expect namedtuple types always return
# `namedtuple` instead of the actual namedtuple type. Even if the type
# is explicitly registered.
if is_namedtuple_class(node_type):
return namedtuple
return node_type
"""
def call_function(
self,
tx: "InstructionTranslator",
args: Sequence[VariableTracker],
kwargs: dict[str, VariableTracker],
) -> VariableTracker:
if len(args) != 1:
raise_type_error_exc(
tx,
f"pytree_get_node_type requires exactly 1 argument, got {len(args)}",
)
type_source = None
if args[0].source:
install_guard(args[0].source.make_guard(GuardBuilder.TYPE_MATCH))
type_source = TypeSource(args[0].source)
python_type = args[0].python_type()
if is_namedtuple_class(python_type):
return VariableTracker.build(tx, namedtuple)
return VariableTracker.build(tx, python_type, source=type_source)
class PyTreeTreeIsLeafFunctionVariable(UserFunctionVariable):
"""
`torch.utils._pytree.tree_is_leaf` is a hot function. We special-case it to reduce Dynamo tracing time.
def tree_is_leaf(
tree: PyTree,
is_leaf: Callable[[PyTree], bool] | None = None,
) -> bool:
if is_leaf is not None and is_leaf(tree):
return True
return _get_node_type(tree) not in SUPPORTED_NODES
When is_leaf is None (the common case), we can optimize by not tracing into the function.
When is_leaf is not None, we fall back to regular tracing since it requires executing user code.
"""
def call_function(
self,
tx: "InstructionTranslator",
args: Sequence[VariableTracker],
kwargs: dict[str, VariableTracker],
) -> VariableTracker:
# tree_is_leaf(tree, is_leaf=None)
if len(args) < 1 or len(args) > 2:
raise_type_error_exc(
tx,
f"tree_is_leaf requires 1 or 2 arguments, got {len(args)}",
)
# Check if is_leaf parameter is provided
is_leaf = kwargs.get("is_leaf", ConstantVariable.create(None))
if len(args) == 2:
is_leaf = args[1]
if not (
isinstance(is_leaf, variables.ConstantVariable) and is_leaf.value is None
):
return super().call_function(tx, args, kwargs)
# Optimize the case where is_leaf is None
# return _get_node_type(tree) not in SUPPORTED_NODES
tree = args[0]
node_type_var = PyTreeGetNodeTypeFunctionVariable(
torch.utils._pytree._get_node_type
).call_function(tx, [tree], {})
# If SUPPORTED_NODES was seen and mutated earlier, it will have a source,
# and building from that source gives us the mutated SUPPORTED_NODES.
supported_nodes_var = VariableTracker.build(
tx,
torch.utils._pytree.SUPPORTED_NODES,
source=get_pytree_SUPPORTED_NODES_source(),
)
out = supported_nodes_var.call_method(tx, "__contains__", [node_type_var], {})
return ConstantVariable.create(not out.value)
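
A minimal usage sketch, adapted from the tests added in this stack (see test_pytree_tree_is_leaf_not_traced above); it only assumes the CompileCounter helper those tests already use.

import torch
import torch._dynamo.testing
import torch.utils._pytree as pytree

cnt = torch._dynamo.testing.CompileCounter()

@torch.compile(backend=cnt, fullgraph=True)
def fn(x, y):
    # With the special-cased variable trackers, tree_is_leaf/_get_node_type
    # are evaluated at trace time instead of being traced into.
    assert pytree.tree_is_leaf(x)
    flat, _spec = pytree.tree_flatten({"a": x, "b": y})
    return flat[0] + flat[1]

out = fn(torch.randn(3, 4), torch.randn(3, 4))
assert cnt.frame_count == 1  # compiles as a single graph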

View File

@ -138,7 +138,7 @@ inline void PyErr_SetString(PyObject* type, const std::string& message) {
throw; \
} \
} \
catch (const std::exception&) { \
catch (const std::exception& e) { \
torch::translate_exception_to_python(std::current_exception()); \
return retval; \
}

View File

@ -81,7 +81,7 @@ c10::intrusive_ptr<Backend> ProcessGroup::getBackend(
ProcessGroup::BackendType backendType{ProcessGroup::BackendType::UNDEFINED};
try {
backendType = deviceTypeToBackendType_.at(deviceType);
} catch (const std::out_of_range&) {
} catch (const std::out_of_range& e) {
TORCH_CHECK(
false, "No backend type associated with device type ", deviceType);
}

View File

@ -246,7 +246,7 @@ class UvTcpServer : public UvTcpSocket {
uv_err_name(uv_res),
uv_strerror(uv_res)));
res->cacheSocketPort();
} catch (std::exception&) {
} catch (std::exception& ex) {
res->close();
throw;
}
@ -322,7 +322,7 @@ class UvTcpServer : public UvTcpSocket {
uv_err_name(uv_res),
uv_strerror(uv_res)));
res->cacheSocketPort();
} catch (std::exception&) {
} catch (std::exception& ex) {
res->close();
throw;
}

View File

@ -353,7 +353,7 @@ static PyObject* NodeBase__update_args_kwargs(
Py_CLEAR(node->_kwargs);
node->_kwargs = map_aggregate(args[1], visit_fn);
Py_RETURN_NONE;
} catch (const PythonError&) {
} catch (const PythonError& e) {
return nullptr;
}
}
@ -397,7 +397,7 @@ static PyObject* NodeBase__replace_input_with(
PyObject* update_args[2] = {new_args.get(), new_kwargs.get()};
return NodeBase__update_args_kwargs(self, update_args, 2);
} catch (const PythonError&) {
} catch (const PythonError& e) {
return nullptr;
}
}
@ -802,7 +802,7 @@ static PyObject* py_map_aggregate(
// args[0]: aggregate, args[1]: callable fn
return map_aggregate(
args[0], [fn](PyObject* a) { return PyObject_CallOneArg(fn, a); });
} catch (const PythonError&) {
} catch (const PythonError& e) {
return nullptr; // error should already be set
}
}
@ -824,7 +824,7 @@ static PyObject* py_map_arg(
}
return Py_NewRef(a);
});
} catch (const PythonError&) {
} catch (const PythonError& e) {
return nullptr; // error should already be set
}
}

View File

@ -117,7 +117,7 @@ struct type_caster<torch::jit::IValue> {
try {
value = torch::jit::toTypeInferredIValue(src);
return true;
} catch (std::exception&) {
} catch (std::exception& e) {
return false;
}
}
@ -142,7 +142,7 @@ struct type_caster<torch::jit::Symbol> {
std::string src_str;
try {
src_str = py::cast<std::string>(src);
} catch (std::exception&) {
} catch (std::exception& e) {
return false;
}
value = torch::jit::Symbol::fromQualString(src_str);

View File

@ -285,7 +285,7 @@ struct FromImpl<torch::headeronly::HeaderOnlyArrayRef<T>> {
torch_list_push_back(new_list_handle, from(elem)));
}
return from(new_list_handle);
} catch (const std::runtime_error&) {
} catch (const std::runtime_error& e) {
if (new_list_handle != nullptr) {
// clean up memory if an error was thrown
TORCH_ERROR_CODE_CHECK(torch_delete_list(new_list_handle));
@ -553,7 +553,7 @@ struct ToImpl<std::vector<T>> {
}
TORCH_ERROR_CODE_CHECK(torch_delete_list(list_handle));
return result;
} catch (const std::runtime_error&) {
} catch (const std::runtime_error& e) {
// clean up memory if an exception is thrown, and rethrow
TORCH_ERROR_CODE_CHECK(torch_delete_list(list_handle));
throw;