mirror of
https://github.com/pytorch/pytorch.git
synced 2025-11-19 10:04:58 +08:00
Compare commits
7 Commits
update_sub
...
ciflow/tru
| Author | SHA1 | Date | |
|---|---|---|---|
| ec8c59a6ad | |||
| b6dcd5186e | |||
| 6406a010a8 | |||
| cc5a1c93de | |||
| dd3575139b | |||
| 1a55a4777c | |||
| 456833f78a |
@ -125,10 +125,10 @@ case "$tag" in
|
||||
UCC_COMMIT=${_UCC_COMMIT}
|
||||
TRITON=yes
|
||||
;;
|
||||
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11-inductor-benchmarks)
|
||||
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks)
|
||||
CUDA_VERSION=12.8.1
|
||||
ANACONDA_PYTHON_VERSION=3.10
|
||||
GCC_VERSION=11
|
||||
GCC_VERSION=9
|
||||
VISION=yes
|
||||
KATEX=yes
|
||||
UCX_COMMIT=${_UCX_COMMIT}
|
||||
@ -146,6 +146,16 @@ case "$tag" in
|
||||
UCC_COMMIT=${_UCC_COMMIT}
|
||||
TRITON=yes
|
||||
;;
|
||||
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9)
|
||||
CUDA_VERSION=12.8.1
|
||||
ANACONDA_PYTHON_VERSION=3.10
|
||||
GCC_VERSION=9
|
||||
VISION=yes
|
||||
KATEX=yes
|
||||
UCX_COMMIT=${_UCX_COMMIT}
|
||||
UCC_COMMIT=${_UCC_COMMIT}
|
||||
TRITON=yes
|
||||
;;
|
||||
pytorch-linux-jammy-py3-clang12-onnx)
|
||||
ANACONDA_PYTHON_VERSION=3.10
|
||||
CLANG_VERSION=12
|
||||
|
||||
@ -23,7 +23,7 @@ jobs:
|
||||
uses: ./.github/workflows/_linux-build.yml
|
||||
with:
|
||||
runner: linux.12xlarge.memory
|
||||
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm80
|
||||
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
|
||||
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
|
||||
cuda-arch-list: '8.0 9.0'
|
||||
test-matrix: |
|
||||
@ -39,7 +39,7 @@ jobs:
|
||||
needs: attn-microbenchmark-build
|
||||
with:
|
||||
timeout-minutes: 500
|
||||
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm80
|
||||
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
|
||||
docker-image: ${{ needs.attn-microbenchmark-build.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.attn-microbenchmark-build.outputs.test-matrix }}
|
||||
secrets: inherit
|
||||
@ -51,7 +51,7 @@ jobs:
|
||||
uses: ./.github/workflows/_linux-build.yml
|
||||
with:
|
||||
runner: linux.12xlarge.memory
|
||||
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100
|
||||
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100
|
||||
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
|
||||
cuda-arch-list: '10.0'
|
||||
test-matrix: |
|
||||
@ -66,7 +66,7 @@ jobs:
|
||||
needs: opmicrobenchmark-build-b200
|
||||
with:
|
||||
timeout-minutes: 500
|
||||
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100
|
||||
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100
|
||||
docker-image: ${{ needs.opmicrobenchmark-build-b200.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.opmicrobenchmark-build-b200.outputs.test-matrix }}
|
||||
aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
|
||||
|
||||
3
.github/workflows/docker-builds.yml
vendored
3
.github/workflows/docker-builds.yml
vendored
@ -52,7 +52,8 @@ jobs:
|
||||
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11,
|
||||
pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11,
|
||||
pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc11-vllm,
|
||||
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11-inductor-benchmarks,
|
||||
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks,
|
||||
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9,
|
||||
pytorch-linux-jammy-cuda12.4-cudnn9-py3-gcc11,
|
||||
pytorch-linux-jammy-py3.10-clang12,
|
||||
pytorch-linux-jammy-py3.11-clang12,
|
||||
|
||||
7
.github/workflows/docker-cache-rocm.yml
vendored
7
.github/workflows/docker-cache-rocm.yml
vendored
@ -50,10 +50,9 @@ jobs:
|
||||
matrix:
|
||||
runner: [linux.rocm.gfx942.docker-cache]
|
||||
docker-image: [
|
||||
"${{ needs.download-docker-builds-artifacts.outputs.pytorch-linux-jammy-rocm-n-py3 }}"
|
||||
#"${{ needs.download-docker-builds-artifacts.outputs.pytorch-linux-jammy-rocm-n-py3 }}",
|
||||
#"${{ needs.download-docker-builds-artifacts.outputs.pytorch-linux-noble-rocm-n-py3 }}",
|
||||
#"${{ needs.download-docker-builds-artifacts.outputs.pytorch-linux-jammy-rocm-n-py3-benchmarks }}"
|
||||
"${{ needs.download-docker-builds-artifacts.outputs.pytorch-linux-jammy-rocm-n-py3 }}",
|
||||
"${{ needs.download-docker-builds-artifacts.outputs.pytorch-linux-noble-rocm-n-py3 }}",
|
||||
"${{ needs.download-docker-builds-artifacts.outputs.pytorch-linux-jammy-rocm-n-py3-benchmarks }}"
|
||||
]
|
||||
runs-on: "${{ matrix.runner }}"
|
||||
steps:
|
||||
|
||||
10
.github/workflows/inductor-micro-benchmark.yml
vendored
10
.github/workflows/inductor-micro-benchmark.yml
vendored
@ -30,14 +30,14 @@ jobs:
|
||||
opt_out_experiments: lf
|
||||
|
||||
build:
|
||||
name: cuda12.8-py3.10-gcc11-sm80
|
||||
name: cuda12.8-py3.10-gcc9-sm80
|
||||
uses: ./.github/workflows/_linux-build.yml
|
||||
needs:
|
||||
- get-default-label-prefix
|
||||
with:
|
||||
runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
|
||||
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm80
|
||||
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11-inductor-benchmarks
|
||||
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
|
||||
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
|
||||
cuda-arch-list: '8.0'
|
||||
test-matrix: |
|
||||
{ include: [
|
||||
@ -46,11 +46,11 @@ jobs:
|
||||
secrets: inherit
|
||||
|
||||
test:
|
||||
name: cuda12.8-py3.10-gcc11-sm80
|
||||
name: cuda12.8-py3.10-gcc9-sm80
|
||||
uses: ./.github/workflows/_linux-test.yml
|
||||
needs: build
|
||||
with:
|
||||
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm80
|
||||
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
|
||||
docker-image: ${{ needs.build.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.build.outputs.test-matrix }}
|
||||
timeout-minutes: 720
|
||||
|
||||
10
.github/workflows/inductor-perf-compare.yml
vendored
10
.github/workflows/inductor-perf-compare.yml
vendored
@ -27,14 +27,14 @@ jobs:
|
||||
opt_out_experiments: lf
|
||||
|
||||
build:
|
||||
name: cuda12.8-py3.10-gcc11-sm80
|
||||
name: cuda12.8-py3.10-gcc9-sm80
|
||||
uses: ./.github/workflows/_linux-build.yml
|
||||
needs:
|
||||
- get-default-label-prefix
|
||||
with:
|
||||
runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
|
||||
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm80
|
||||
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11-inductor-benchmarks
|
||||
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
|
||||
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
|
||||
cuda-arch-list: '8.0'
|
||||
test-matrix: |
|
||||
{ include: [
|
||||
@ -47,11 +47,11 @@ jobs:
|
||||
secrets: inherit
|
||||
|
||||
test:
|
||||
name: cuda12.8-py3.10-gcc11-sm80
|
||||
name: cuda12.8-py3.10-gcc9-sm80
|
||||
uses: ./.github/workflows/_linux-test.yml
|
||||
needs: build
|
||||
with:
|
||||
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm80
|
||||
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
|
||||
docker-image: ${{ needs.build.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.build.outputs.test-matrix }}
|
||||
# disable monitor in perf tests for more investigation
|
||||
|
||||
18
.github/workflows/inductor-perf-test-b200.yml
vendored
18
.github/workflows/inductor-perf-test-b200.yml
vendored
@ -80,7 +80,7 @@ jobs:
|
||||
opt_out_experiments: lf
|
||||
|
||||
build:
|
||||
name: cuda12.8-py3.10-gcc11-sm100
|
||||
name: cuda12.8-py3.10-gcc9-sm100
|
||||
uses: ./.github/workflows/_linux-build.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
@ -90,8 +90,8 @@ jobs:
|
||||
# from trunk. Also use a memory-intensive runner here because memory is
|
||||
# usually the bottleneck
|
||||
runner: linux.12xlarge.memory
|
||||
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100
|
||||
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11-inductor-benchmarks
|
||||
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100
|
||||
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
|
||||
cuda-arch-list: '10.0'
|
||||
test-matrix: |
|
||||
{ include: [
|
||||
@ -104,12 +104,12 @@ jobs:
|
||||
secrets: inherit
|
||||
|
||||
test-periodically:
|
||||
name: cuda12.8-py3.10-gcc11-sm100
|
||||
name: cuda12.8-py3.10-gcc9-sm100
|
||||
uses: ./.github/workflows/_linux-test.yml
|
||||
needs: build
|
||||
if: github.event.schedule == '0 7 * * 1-6'
|
||||
with:
|
||||
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100
|
||||
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100
|
||||
dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-cudagraphs_low_precision-true
|
||||
docker-image: ${{ needs.build.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.build.outputs.test-matrix }}
|
||||
@ -121,12 +121,12 @@ jobs:
|
||||
secrets: inherit
|
||||
|
||||
test-weekly:
|
||||
name: cuda12.8-py3.10-gcc11-sm100
|
||||
name: cuda12.8-py3.10-gcc9-sm100
|
||||
uses: ./.github/workflows/_linux-test.yml
|
||||
needs: build
|
||||
if: github.event.schedule == '0 7 * * 0'
|
||||
with:
|
||||
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100
|
||||
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100
|
||||
dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-maxautotune-true-freeze_autotune_cudagraphs-true-cudagraphs_low_precision-true
|
||||
docker-image: ${{ needs.build.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.build.outputs.test-matrix }}
|
||||
@ -138,11 +138,11 @@ jobs:
|
||||
secrets: inherit
|
||||
|
||||
test:
|
||||
name: cuda12.8-py3.10-gcc11-sm100
|
||||
name: cuda12.8-py3.10-gcc9-sm100
|
||||
uses: ./.github/workflows/_linux-test.yml
|
||||
needs: build
|
||||
with:
|
||||
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100
|
||||
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100
|
||||
dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cudagraphs-${{ inputs.cudagraphs }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}-maxautotune-${{ inputs.maxautotune }}-freezing_cudagraphs-${{ inputs.freezing_cudagraphs }}-cudagraphs_low_precision-${{ inputs.cudagraphs }}
|
||||
docker-image: ${{ needs.build.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.build.outputs.test-matrix }}
|
||||
|
||||
@ -95,8 +95,8 @@ jobs:
|
||||
# from trunk. Also use a memory-intensive runner here because memory is
|
||||
# usually the bottleneck
|
||||
runner: linux.12xlarge.memory
|
||||
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm90
|
||||
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11-inductor-benchmarks
|
||||
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm90
|
||||
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
|
||||
cuda-arch-list: '9.0'
|
||||
test-matrix: |
|
||||
{ include: [
|
||||
@ -132,7 +132,7 @@ jobs:
|
||||
needs: build
|
||||
if: github.event.schedule == '15 0 * * 1-6'
|
||||
with:
|
||||
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm90
|
||||
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm90
|
||||
dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-cudagraphs_low_precision-true
|
||||
docker-image: ${{ needs.build.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.build.outputs.test-matrix }}
|
||||
@ -149,7 +149,7 @@ jobs:
|
||||
needs: build
|
||||
if: github.event.schedule == '0 7 * * 0'
|
||||
with:
|
||||
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm90
|
||||
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm90
|
||||
dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-maxautotune-true-freeze_autotune_cudagraphs-true-cudagraphs_low_precision-true
|
||||
docker-image: ${{ needs.build.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.build.outputs.test-matrix }}
|
||||
@ -168,7 +168,7 @@ jobs:
|
||||
# needs one round of benchmark
|
||||
if: ${{ github.event_name == 'workflow_dispatch' || github.event_name == 'pull_request' }}
|
||||
with:
|
||||
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm90
|
||||
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm90
|
||||
dashboard-tag: training-${{ inputs.training || 'true' }}-inference-${{ inputs.inference || 'true' }}-default-${{ inputs.default || 'true' }}-dynamic-${{ inputs.dynamic || 'true' }}-cudagraphs-${{ inputs.cudagraphs || 'true' }}-cppwrapper-${{ inputs.cppwrapper || 'false' }}-aotinductor-${{ inputs.aotinductor || 'false' }}-maxautotune-${{ inputs.maxautotune || 'false' }}-freezing_cudagraphs-${{ inputs.freezing_cudagraphs || 'false' }}-cudagraphs_low_precision-${{ inputs.cudagraphs || 'false' }}
|
||||
docker-image: ${{ needs.build.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.build.outputs.test-matrix }}
|
||||
|
||||
18
.github/workflows/inductor-perf-test-nightly.yml
vendored
18
.github/workflows/inductor-perf-test-nightly.yml
vendored
@ -80,15 +80,15 @@ jobs:
|
||||
opt_out_experiments: lf
|
||||
|
||||
build:
|
||||
name: cuda12.8-py3.10-gcc11-sm80
|
||||
name: cuda12.8-py3.10-gcc9-sm80
|
||||
uses: ./.github/workflows/_linux-build.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
# Every bit to make perf run faster helps
|
||||
runner: linux.12xlarge.memory
|
||||
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm80
|
||||
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11-inductor-benchmarks
|
||||
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
|
||||
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
|
||||
cuda-arch-list: '8.0'
|
||||
test-matrix: |
|
||||
{ include: [
|
||||
@ -117,12 +117,12 @@ jobs:
|
||||
secrets: inherit
|
||||
|
||||
test-nightly:
|
||||
name: cuda12.8-py3.10-gcc11-sm80
|
||||
name: cuda12.8-py3.10-gcc9-sm80
|
||||
uses: ./.github/workflows/_linux-test.yml
|
||||
needs: build
|
||||
if: github.event.schedule == '0 7 * * 1-6'
|
||||
with:
|
||||
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm80
|
||||
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
|
||||
dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-cudagraphs_low_precision-true
|
||||
docker-image: ${{ needs.build.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.build.outputs.test-matrix }}
|
||||
@ -133,12 +133,12 @@ jobs:
|
||||
secrets: inherit
|
||||
|
||||
test-weekly:
|
||||
name: cuda12.8-py3.10-gcc11-sm80
|
||||
name: cuda12.8-py3.10-gcc9-sm80
|
||||
uses: ./.github/workflows/_linux-test.yml
|
||||
needs: build
|
||||
if: github.event.schedule == '0 7 * * 0'
|
||||
with:
|
||||
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm80
|
||||
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
|
||||
dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-maxautotune-true-freeze_autotune_cudagraphs-true-cudagraphs_low_precision-true
|
||||
docker-image: ${{ needs.build.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.build.outputs.test-matrix }}
|
||||
@ -150,12 +150,12 @@ jobs:
|
||||
secrets: inherit
|
||||
|
||||
test:
|
||||
name: cuda12.8-py3.10-gcc11-sm80
|
||||
name: cuda12.8-py3.10-gcc9-sm80
|
||||
uses: ./.github/workflows/_linux-test.yml
|
||||
needs: build
|
||||
if: github.event_name == 'workflow_dispatch'
|
||||
with:
|
||||
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm80
|
||||
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
|
||||
dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cudagraphs-${{ inputs.cudagraphs }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}-maxautotune-${{ inputs.maxautotune }}-freezing_cudagraphs-${{ inputs.freezing_cudagraphs }}-cudagraphs_low_precision-${{ inputs.cudagraphs }}
|
||||
docker-image: ${{ needs.build.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.build.outputs.test-matrix }}
|
||||
|
||||
12
.github/workflows/inductor-periodic.yml
vendored
12
.github/workflows/inductor-periodic.yml
vendored
@ -37,8 +37,8 @@ jobs:
|
||||
needs: get-default-label-prefix
|
||||
with:
|
||||
runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
|
||||
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm86
|
||||
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11-inductor-benchmarks
|
||||
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86
|
||||
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
|
||||
cuda-arch-list: '8.0;8.6'
|
||||
test-matrix: |
|
||||
{ include: [
|
||||
@ -76,7 +76,7 @@ jobs:
|
||||
uses: ./.github/workflows/_linux-test.yml
|
||||
needs: periodic-dynamo-benchmarks-build
|
||||
with:
|
||||
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm86
|
||||
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86
|
||||
docker-image: ${{ needs.periodic-dynamo-benchmarks-build.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.periodic-dynamo-benchmarks-build.outputs.test-matrix }}
|
||||
secrets: inherit
|
||||
@ -138,8 +138,8 @@ jobs:
|
||||
- get-default-label-prefix
|
||||
with:
|
||||
runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
|
||||
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm80
|
||||
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11-inductor-benchmarks
|
||||
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
|
||||
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
|
||||
cuda-arch-list: '8.0'
|
||||
test-matrix: |
|
||||
{ include: [
|
||||
@ -153,7 +153,7 @@ jobs:
|
||||
uses: ./.github/workflows/_linux-test.yml
|
||||
needs: inductor-smoke-build
|
||||
with:
|
||||
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm80
|
||||
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
|
||||
docker-image: ${{ needs.inductor-smoke-build.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.inductor-smoke-build.outputs.test-matrix }}
|
||||
secrets: inherit
|
||||
|
||||
6
.github/workflows/inductor-unittest.yml
vendored
6
.github/workflows/inductor-unittest.yml
vendored
@ -33,8 +33,8 @@ jobs:
|
||||
uses: ./.github/workflows/_linux-build.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm86
|
||||
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11-inductor-benchmarks
|
||||
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86
|
||||
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
|
||||
cuda-arch-list: '8.6'
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
test-matrix: |
|
||||
@ -52,7 +52,7 @@ jobs:
|
||||
uses: ./.github/workflows/_linux-test.yml
|
||||
needs: inductor-build
|
||||
with:
|
||||
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm86
|
||||
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86
|
||||
docker-image: ${{ needs.inductor-build.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.inductor-build.outputs.test-matrix }}
|
||||
secrets: inherit
|
||||
|
||||
6
.github/workflows/inductor.yml
vendored
6
.github/workflows/inductor.yml
vendored
@ -49,8 +49,8 @@ jobs:
|
||||
uses: ./.github/workflows/_linux-build.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm86
|
||||
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11-inductor-benchmarks
|
||||
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86
|
||||
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
|
||||
cuda-arch-list: '8.6'
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
test-matrix: |
|
||||
@ -69,7 +69,7 @@ jobs:
|
||||
uses: ./.github/workflows/_linux-test.yml
|
||||
needs: inductor-build
|
||||
with:
|
||||
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm86
|
||||
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86
|
||||
docker-image: ${{ needs.inductor-build.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.inductor-build.outputs.test-matrix }}
|
||||
secrets: inherit
|
||||
|
||||
@ -25,7 +25,7 @@ jobs:
|
||||
uses: ./.github/workflows/_linux-build.yml
|
||||
with:
|
||||
runner: linux.12xlarge.memory
|
||||
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm80
|
||||
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
|
||||
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
|
||||
cuda-arch-list: '8.0 9.0'
|
||||
test-matrix: |
|
||||
@ -41,7 +41,7 @@ jobs:
|
||||
needs: opmicrobenchmark-build
|
||||
with:
|
||||
timeout-minutes: 500
|
||||
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm80
|
||||
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
|
||||
docker-image: ${{ needs.opmicrobenchmark-build.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.opmicrobenchmark-build.outputs.test-matrix }}
|
||||
secrets: inherit
|
||||
@ -53,7 +53,7 @@ jobs:
|
||||
uses: ./.github/workflows/_linux-build.yml
|
||||
with:
|
||||
runner: linux.12xlarge.memory
|
||||
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100
|
||||
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100
|
||||
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
|
||||
cuda-arch-list: '10.0'
|
||||
test-matrix: |
|
||||
@ -68,7 +68,7 @@ jobs:
|
||||
needs: opmicrobenchmark-build-b200
|
||||
with:
|
||||
timeout-minutes: 500
|
||||
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100
|
||||
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100
|
||||
docker-image: ${{ needs.opmicrobenchmark-build-b200.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.opmicrobenchmark-build-b200.outputs.test-matrix }}
|
||||
aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
|
||||
|
||||
51
.github/workflows/periodic.yml
vendored
51
.github/workflows/periodic.yml
vendored
@ -90,7 +90,6 @@ jobs:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build-environment: linux-jammy-cuda12.8-py3.10-gcc11
|
||||
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
|
||||
cuda-arch-list: 8.6
|
||||
test-matrix: |
|
||||
{ include: [
|
||||
{ config: "nogpu_AVX512", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
|
||||
@ -98,9 +97,7 @@ jobs:
|
||||
{ config: "nogpu_AVX512", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
|
||||
{ config: "nogpu_NO_AVX2", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
|
||||
{ config: "nogpu_NO_AVX2", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
|
||||
{ config: "jit_legacy", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
|
||||
{ config: "multigpu", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.12xlarge.nvidia.gpu", owners: ["oncall:distributed"] },
|
||||
{ config: "multigpu", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.12xlarge.nvidia.gpu", owners: ["oncall:distributed"] },
|
||||
{ config: "jit_legacy", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
|
||||
]}
|
||||
secrets: inherit
|
||||
|
||||
@ -116,14 +113,40 @@ jobs:
|
||||
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build.outputs.test-matrix }}
|
||||
secrets: inherit
|
||||
|
||||
linux-jammy-cuda12_8-py3_10-gcc11-debug-build:
|
||||
name: linux-jammy-cuda12.8-py3.10-gcc11-debug
|
||||
linux-jammy-cuda12_8-py3_10-gcc9-build:
|
||||
name: linux-jammy-cuda12.8-py3.10-gcc9
|
||||
uses: ./.github/workflows/_linux-build.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-debug
|
||||
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
|
||||
build-environment: linux-jammy-cuda12.8-py3.10-gcc9
|
||||
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9
|
||||
cuda-arch-list: 8.6
|
||||
test-matrix: |
|
||||
{ include: [
|
||||
{ config: "multigpu", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.12xlarge.nvidia.gpu", owners: ["oncall:distributed"] },
|
||||
{ config: "multigpu", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.12xlarge.nvidia.gpu", owners: ["oncall:distributed"] },
|
||||
]}
|
||||
secrets: inherit
|
||||
|
||||
linux-jammy-cuda12_8-py3_10-gcc9-test:
|
||||
name: linux-jammy-cuda12.8-py3.10-gcc9
|
||||
uses: ./.github/workflows/_linux-test.yml
|
||||
needs: linux-jammy-cuda12_8-py3_10-gcc9-build
|
||||
with:
|
||||
build-environment: linux-jammy-cuda12.8-py3.10-gcc9
|
||||
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-build.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-build.outputs.test-matrix }}
|
||||
secrets: inherit
|
||||
|
||||
linux-jammy-cuda12_8-py3_10-gcc9-debug-build:
|
||||
name: linux-jammy-cuda12.8-py3.10-gcc9-debug
|
||||
uses: ./.github/workflows/_linux-build.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-debug
|
||||
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9
|
||||
cuda-arch-list: 8.9
|
||||
test-matrix: |
|
||||
{ include: [
|
||||
@ -137,16 +160,16 @@ jobs:
|
||||
]}
|
||||
secrets: inherit
|
||||
|
||||
linux-jammy-cuda12_8-py3_10-gcc11-debug-test:
|
||||
name: linux-jammy-cuda12.8-py3.10-gcc11-debug
|
||||
linux-jammy-cuda12_8-py3_10-gcc9-debug-test:
|
||||
name: linux-jammy-cuda12.8-py3.10-gcc9-debug
|
||||
uses: ./.github/workflows/_linux-test.yml
|
||||
needs:
|
||||
- linux-jammy-cuda12_8-py3_10-gcc11-debug-build
|
||||
- linux-jammy-cuda12_8-py3_10-gcc9-debug-build
|
||||
- target-determination
|
||||
with:
|
||||
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-debug
|
||||
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-debug-build.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-debug-build.outputs.test-matrix }}
|
||||
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-debug
|
||||
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-debug-build.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-debug-build.outputs.test-matrix }}
|
||||
secrets: inherit
|
||||
|
||||
linux-jammy-cuda13_0-py3_10-gcc11-build:
|
||||
|
||||
20
.github/workflows/pull.yml
vendored
20
.github/workflows/pull.yml
vendored
@ -318,14 +318,14 @@ jobs:
|
||||
]}
|
||||
secrets: inherit
|
||||
|
||||
linux-jammy-cuda12_8-py3_10-gcc11-inductor-build:
|
||||
name: cuda12.8-py3.10-gcc11-sm75
|
||||
linux-jammy-cuda12_8-py3_10-gcc9-inductor-build:
|
||||
name: cuda12.8-py3.10-gcc9-sm75
|
||||
uses: ./.github/workflows/_linux-build.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm75
|
||||
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11-inductor-benchmarks
|
||||
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm75
|
||||
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
|
||||
cuda-arch-list: '7.5'
|
||||
test-matrix: |
|
||||
{ include: [
|
||||
@ -333,14 +333,14 @@ jobs:
|
||||
]}
|
||||
secrets: inherit
|
||||
|
||||
linux-jammy-cuda12_8-py3_10-gcc11-inductor-test:
|
||||
name: cuda12.8-py3.10-gcc11-sm75
|
||||
linux-jammy-cuda12_8-py3_10-gcc9-inductor-test:
|
||||
name: cuda12.8-py3.10-gcc9-sm75
|
||||
uses: ./.github/workflows/_linux-test.yml
|
||||
needs: linux-jammy-cuda12_8-py3_10-gcc11-inductor-build
|
||||
needs: linux-jammy-cuda12_8-py3_10-gcc9-inductor-build
|
||||
with:
|
||||
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm75
|
||||
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-inductor-build.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-inductor-build.outputs.test-matrix }}
|
||||
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm75
|
||||
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.test-matrix }}
|
||||
secrets: inherit
|
||||
|
||||
linux-noble-xpu-n-py3_10-build:
|
||||
|
||||
10
.github/workflows/torchbench.yml
vendored
10
.github/workflows/torchbench.yml
vendored
@ -26,14 +26,14 @@ jobs:
|
||||
curr_ref_type: ${{ github.ref_type }}
|
||||
|
||||
build:
|
||||
name: cuda12.8-py3.10-gcc11-sm80
|
||||
name: cuda12.8-py3.10-gcc9-sm80
|
||||
uses: ./.github/workflows/_linux-build.yml
|
||||
needs:
|
||||
- get-default-label-prefix
|
||||
with:
|
||||
runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
|
||||
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm80
|
||||
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11-inductor-benchmarks
|
||||
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
|
||||
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
|
||||
cuda-arch-list: '8.0'
|
||||
test-matrix: |
|
||||
{ include: [
|
||||
@ -42,11 +42,11 @@ jobs:
|
||||
secrets: inherit
|
||||
|
||||
test:
|
||||
name: cuda12.8-py3.10-gcc11-sm80
|
||||
name: cuda12.8-py3.10-gcc9-sm80
|
||||
uses: ./.github/workflows/_linux-test.yml
|
||||
needs: build
|
||||
with:
|
||||
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm80
|
||||
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
|
||||
docker-image: ${{ needs.build.outputs.docker-image }}
|
||||
test-matrix: ${{ needs.build.outputs.test-matrix }}
|
||||
secrets: inherit
|
||||
|
||||
4
.github/workflows/trunk.yml
vendored
4
.github/workflows/trunk.yml
vendored
@ -231,8 +231,8 @@ jobs:
|
||||
uses: ./.github/workflows/_linux-build.yml
|
||||
needs: get-label-type
|
||||
with:
|
||||
build-environment: linux-jammy-cuda12.8-py3.12-gcc11-sm80
|
||||
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11-inductor-benchmarks
|
||||
build-environment: linux-jammy-cuda12.8-py3.12-gcc9-sm80
|
||||
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
|
||||
cuda-arch-list: '8.0'
|
||||
secrets: inherit
|
||||
|
||||
|
||||
@ -813,43 +813,8 @@ void smooth_l1_kernel(TensorIteratorBase& iter, double beta) {
|
||||
}
|
||||
|
||||
void huber_kernel(TensorIterator& iter, double delta) {
|
||||
// Special-case kHalf: compute in float for numerical stability
|
||||
if (iter.dtype() == kHalf) {
|
||||
const float delta_val(static_cast<float>(delta));
|
||||
const Vectorized<float> delta_vec(static_cast<float>(delta));
|
||||
const Vectorized<float> point_five_vec(static_cast<float>(0.5));
|
||||
cpu_kernel_vec(
|
||||
iter,
|
||||
// scalar lambda: convert half -> float, compute in float, cast back to half
|
||||
[&delta_val] (at::Half a, at::Half b) -> at::Half {
|
||||
float af = static_cast<float>(a);
|
||||
float bf = static_cast<float>(b);
|
||||
float z = std::abs(af - bf);
|
||||
float out = z < delta_val
|
||||
? 0.5f * z * z
|
||||
: delta_val * (z - 0.5f * delta_val);
|
||||
return static_cast<at::Half>(out);
|
||||
},
|
||||
[&delta_vec, &point_five_vec] (Vectorized<Half> a, Vectorized<Half> b) {
|
||||
auto [a0, a1] = convert_half_float(a);
|
||||
auto [b0, b1] = convert_half_float(b);
|
||||
auto z = (a0 - b0).abs();
|
||||
a0 = Vectorized<float>::blendv(
|
||||
point_five_vec * z * z,
|
||||
delta_vec * (z - point_five_vec * delta_vec),
|
||||
z >= delta_vec);
|
||||
z = (a1 - b1).abs();
|
||||
a1 = Vectorized<float>::blendv(
|
||||
point_five_vec * z * z,
|
||||
delta_vec * (z - point_five_vec * delta_vec),
|
||||
z >= delta_vec);
|
||||
return convert_float_half(a0, a1);
|
||||
}
|
||||
);
|
||||
return;
|
||||
}
|
||||
else {
|
||||
AT_DISPATCH_FLOATING_TYPES_AND(kBFloat16, iter.dtype(), "huber_cpu", [&]() {
|
||||
AT_DISPATCH_FLOATING_TYPES_AND2(
|
||||
kBFloat16, kHalf, iter.dtype(), "huber_cpu", [&]() {
|
||||
using Vec = Vectorized<scalar_t>;
|
||||
const scalar_t delta_val(delta);
|
||||
const Vec delta_val_vec(delta_val);
|
||||
@ -870,7 +835,6 @@ void huber_kernel(TensorIterator& iter, double delta) {
|
||||
z >= delta_val_vec);
|
||||
});
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
void sigmoid_backward_kernel(TensorIteratorBase& iter) {
|
||||
|
||||
@ -147,19 +147,6 @@ class MetalShaderLibrary {
|
||||
const std::optional<c10::Scalar> alpha = std::nullopt,
|
||||
const std::optional<c10::ScalarType> scalar_arg_type = std::nullopt);
|
||||
|
||||
template <typename T>
|
||||
void exec_unary_kernel_with_params(
|
||||
TensorIteratorBase& iter,
|
||||
const std::string& name,
|
||||
T params,
|
||||
const std::string& params_type_name);
|
||||
template <typename T>
|
||||
void exec_binary_kernel_with_params(
|
||||
TensorIteratorBase& iter,
|
||||
const std::string& name,
|
||||
T params,
|
||||
const std::string& params_type_name);
|
||||
|
||||
protected:
|
||||
virtual MTLLibrary_t getLibrary();
|
||||
virtual MTLLibrary_t getLibrary(
|
||||
|
||||
@ -7,12 +7,10 @@
|
||||
#include <ATen/Tensor.h>
|
||||
#include <ATen/TensorIterator.h>
|
||||
#include <ATen/Utils.h>
|
||||
#include <ATen/mps/MPSProfiler.h>
|
||||
#include <ATen/mps/MPSStream.h>
|
||||
#include <ATen/native/mps/MetalShaderLibrary.h>
|
||||
#include <ATen/native/mps/TensorFactory.h>
|
||||
#include <c10/core/ScalarType.h>
|
||||
#include <fmt/format.h>
|
||||
#include <torch/library.h>
|
||||
#include <unordered_map>
|
||||
|
||||
@ -632,147 +630,4 @@ inline bool needsGather(const TensorBase& t) {
|
||||
return !is_macOS_15_0_or_newer && (!t.is_contiguous() || t.storage_offset());
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void MetalShaderLibrary::exec_unary_kernel_with_params(TensorIteratorBase& iter,
|
||||
const std::string& name,
|
||||
T params,
|
||||
const std::string& params_type_name) {
|
||||
using namespace at::mps;
|
||||
// Decompose 64-bit tensor into 32-bit ones
|
||||
if (!iter.can_use_32bit_indexing()) {
|
||||
for (auto&& sub_iter : iter.with_32bit_indexing()) {
|
||||
exec_unary_kernel_with_params(sub_iter, name, params, params_type_name);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
auto inputTensor = iter.input(0);
|
||||
auto outputTensor = iter.output(0);
|
||||
uint32_t length = iter.numel();
|
||||
if (length == 0) {
|
||||
return;
|
||||
}
|
||||
auto kernel_name = fmt::format("{}_{}_{}_{}{}",
|
||||
name,
|
||||
iter.is_contiguous() ? "dense" : "strided",
|
||||
scalarToMetalTypeString(outputTensor),
|
||||
scalarToMetalTypeString(inputTensor),
|
||||
fmt::format("_{}", params_type_name));
|
||||
@autoreleasepool {
|
||||
auto cplState = getPipelineStateForFunc(kernel_name);
|
||||
|
||||
MPSStream* mpsStream = getCurrentMPSStream();
|
||||
dispatch_sync(mpsStream->queue(), ^() {
|
||||
auto computeEncoder = mpsStream->commandEncoder();
|
||||
|
||||
getMPSProfiler().beginProfileKernel(cplState, name, {inputTensor});
|
||||
|
||||
[computeEncoder setComputePipelineState:cplState];
|
||||
bind_iter_tensors(computeEncoder, iter);
|
||||
if (!iter.is_contiguous()) {
|
||||
mtl_setArgs<2>(computeEncoder,
|
||||
outputTensor.sizes(),
|
||||
inputTensor.strides(),
|
||||
outputTensor.strides(),
|
||||
inputTensor.ndimension());
|
||||
}
|
||||
detail::mtl_setArg(computeEncoder, params, iter.is_contiguous() ? 2 : 6);
|
||||
mtl_dispatch1DJob(computeEncoder, cplState, length);
|
||||
|
||||
getMPSProfiler().endProfileKernel(cplState);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void MetalShaderLibrary::exec_binary_kernel_with_params(TensorIteratorBase& iter,
|
||||
const std::string& name,
|
||||
T params,
|
||||
const std::string& params_type_name) {
|
||||
using namespace mps;
|
||||
// TODO: Figure a better place to downcast double scalars (probably in tensor iterator itself?)
|
||||
// Right now running something like 1.0-torch.rand(5, device='mps') will create iterator with
|
||||
// double as common dtype (because Python floating point are always 64-bit values)
|
||||
TORCH_CHECK(iter.output().scalar_type() != at::kDouble, "float64 is not supported on MPS");
|
||||
|
||||
// Skip for empty iterators
|
||||
if (iter.numel() == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Decompose 64-bit tensor into 32-bit ones
|
||||
if (!iter.can_use_32bit_indexing()) {
|
||||
for (auto&& sub_iter : iter.with_32bit_indexing()) {
|
||||
exec_binary_kernel_with_params(sub_iter, name, params, params_type_name);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
auto convert_double_scalar = [](Tensor& t) {
|
||||
if (t.dim() != 0) {
|
||||
return;
|
||||
}
|
||||
if (t.scalar_type() == kDouble) {
|
||||
t = t.to(kFloat);
|
||||
} else if (t.scalar_type() == kComplexDouble) {
|
||||
t = t.to(kComplexFloat);
|
||||
}
|
||||
};
|
||||
|
||||
Tensor input = iter.input(0);
|
||||
Tensor other = iter.input(1);
|
||||
Tensor out = iter.output();
|
||||
|
||||
convert_double_scalar(input);
|
||||
convert_double_scalar(other);
|
||||
|
||||
MPSStream* mpsStream = getCurrentMPSStream();
|
||||
const auto cast_needed = input.scalar_type() != other.scalar_type();
|
||||
const auto suffix = iter.is_contiguous() ? "dense" : "strided";
|
||||
// TODO: Implicitly pass both input and output types to non-cast kernels
|
||||
const auto kernel_name = cast_needed
|
||||
? fmt::format("{}_{}_cast_{}_{}", name, suffix, scalarToMetalTypeString(out), params_type_name)
|
||||
: fmt::format("{}_{}_{}_{}_{}",
|
||||
name,
|
||||
suffix,
|
||||
scalarToMetalTypeString(out),
|
||||
scalarToMetalTypeString(input),
|
||||
params_type_name);
|
||||
dispatch_sync_with_rethrow(mpsStream->queue(), ^() {
|
||||
@autoreleasepool {
|
||||
auto computeEncoder = mpsStream->commandEncoder();
|
||||
auto binaryPSO = getPipelineStateForFunc(kernel_name);
|
||||
// this function call is a no-op if MPS Profiler is not enabled
|
||||
getMPSProfiler().beginProfileKernel(binaryPSO, kernel_name, {input, other});
|
||||
[computeEncoder setComputePipelineState:binaryPSO];
|
||||
// Set input and output tensors
|
||||
bind_iter_tensors(computeEncoder, iter);
|
||||
// Iterator is contiguous if all of its elements are dense in storage,
|
||||
// i.e. it's true for both row-first and column-first tensors
|
||||
if (iter.is_contiguous()) {
|
||||
detail::mtl_setArg(computeEncoder, params, 3);
|
||||
if (cast_needed) {
|
||||
std::array<int, 4> size_and_types = {static_cast<int>(c10::elementSize(input.scalar_type())),
|
||||
static_cast<int>(c10::elementSize(other.scalar_type())),
|
||||
static_cast<int>(input.scalar_type()),
|
||||
static_cast<int>(other.scalar_type())};
|
||||
mtl_setBytes(computeEncoder, size_and_types, 4);
|
||||
}
|
||||
} else {
|
||||
// Please note that shapes and strides of the iterator might be
|
||||
// different than that of its operands, for example binary op
|
||||
// between 4x4 tensor and scalar will result in 1D 16 element iterator
|
||||
std::array<int, 4> ndim_and_types = {iter.ndim(),
|
||||
static_cast<int>(input.scalar_type()),
|
||||
static_cast<int>(other.scalar_type()),
|
||||
static_cast<int>(out.scalar_type())};
|
||||
mtl_setArgs<3>(
|
||||
computeEncoder, params, iter.shape(), iter.strides(0), iter.strides(1), iter.strides(2), ndim_and_types);
|
||||
}
|
||||
mtl_dispatch1DJob(computeEncoder, binaryPSO, iter.numel());
|
||||
getMPSProfiler().endProfileKernel(binaryPSO);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
} // namespace at::native::mps
|
||||
|
||||
@ -1,16 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
template <typename T>
|
||||
struct ELUParams {
|
||||
T alpha;
|
||||
T scale;
|
||||
T input_scale;
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
struct ELUBackwardParams {
|
||||
T alpha;
|
||||
T scale;
|
||||
T input_scale;
|
||||
bool is_result;
|
||||
};
|
||||
@ -1,4 +1,3 @@
|
||||
#include <ATen/native/mps/kernels/Activation.h>
|
||||
#include <c10/metal/indexing.h>
|
||||
#include <c10/metal/special_math.h>
|
||||
#include <metal_stdlib>
|
||||
@ -100,59 +99,6 @@ REGISTER_BINARY_OP(hardswish_backward, float, float);
|
||||
REGISTER_BINARY_OP(hardswish_backward, half, half);
|
||||
REGISTER_BINARY_OP(hardswish_backward, bfloat, bfloat);
|
||||
|
||||
struct elu_functor {
|
||||
template <typename T>
|
||||
inline T operator()(const T self_, const ELUParams<T> params) {
|
||||
using op_T = opmath_t<T>;
|
||||
auto alpha = static_cast<op_T>(params.alpha);
|
||||
auto scale = static_cast<op_T>(params.scale);
|
||||
auto input_scale = static_cast<op_T>(params.input_scale);
|
||||
auto self = static_cast<op_T>(self_);
|
||||
auto neg_res = alpha * (::metal::precise::exp(self * input_scale) - 1);
|
||||
return static_cast<T>(scale * (self < 0 ? neg_res : self));
|
||||
}
|
||||
};
|
||||
|
||||
struct elu_backward_functor {
|
||||
template <typename T>
|
||||
inline T operator()(
|
||||
const T grad_output_,
|
||||
const T self_,
|
||||
ELUBackwardParams<T> params) {
|
||||
using op_T = opmath_t<T>;
|
||||
auto alpha = static_cast<op_T>(params.alpha);
|
||||
auto scale = static_cast<op_T>(params.scale);
|
||||
auto input_scale = static_cast<op_T>(params.input_scale);
|
||||
auto grad_output = static_cast<op_T>(grad_output_);
|
||||
auto self = static_cast<op_T>(self_);
|
||||
|
||||
if (params.is_result) {
|
||||
auto neg_coef = input_scale * (self + alpha * scale);
|
||||
return static_cast<T>(grad_output * (self <= 0 ? neg_coef : scale));
|
||||
} else {
|
||||
auto neg_coef = input_scale * alpha * scale *
|
||||
::metal::precise::exp(self * input_scale);
|
||||
return static_cast<T>(grad_output * (self <= 0 ? neg_coef : scale));
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
#define REGISTER_ELU_OP(T) \
|
||||
typedef ELUParams<T> ELUParams_##T; \
|
||||
REGISTER_UNARY_ALPHA_OP(elu, T, ELUParams_##T, T);
|
||||
|
||||
REGISTER_ELU_OP(float);
|
||||
REGISTER_ELU_OP(half);
|
||||
REGISTER_ELU_OP(bfloat);
|
||||
|
||||
#define REGISTER_ELU_BACKWARD_OP(T) \
|
||||
typedef ELUBackwardParams<T> ELUBackwardParams_##T; \
|
||||
REGISTER_BINARY_ALPHA_OP(elu_backward, T, ELUBackwardParams_##T, T);
|
||||
|
||||
REGISTER_ELU_BACKWARD_OP(float);
|
||||
REGISTER_ELU_BACKWARD_OP(half);
|
||||
REGISTER_ELU_BACKWARD_OP(bfloat);
|
||||
|
||||
struct leaky_relu_functor {
|
||||
template <typename T>
|
||||
inline T operator()(const T x, const T negative_slope) {
|
||||
|
||||
@ -11,6 +11,8 @@
|
||||
#include <ATen/ops/_log_softmax_native.h>
|
||||
#include <ATen/ops/_prelu_kernel_backward_native.h>
|
||||
#include <ATen/ops/_prelu_kernel_native.h>
|
||||
#include <ATen/ops/elu_backward_native.h>
|
||||
#include <ATen/ops/elu_native.h>
|
||||
#include <ATen/ops/gelu_backward_native.h>
|
||||
#include <ATen/ops/gelu_native.h>
|
||||
#include <ATen/ops/glu_backward_native.h>
|
||||
@ -696,6 +698,194 @@ TORCH_IMPL_FUNC(gelu_backward_out_mps)
|
||||
}
|
||||
}
|
||||
|
||||
static void elu_variants_out_mps(const Tensor& self,
|
||||
const Scalar& alpha,
|
||||
const Scalar& scale,
|
||||
const Scalar& input_scale,
|
||||
const Tensor& result,
|
||||
std::string func_name) {
|
||||
using namespace mps;
|
||||
using CachedGraph = MPSUnaryCachedGraph;
|
||||
|
||||
auto resultMemFormat = result.suggest_memory_format();
|
||||
bool executeGatherOp = !(self.is_contiguous(resultMemFormat) && result.is_contiguous(resultMemFormat));
|
||||
Tensor out;
|
||||
if (executeGatherOp) {
|
||||
out = at::empty_like(result, MemoryFormat::Contiguous);
|
||||
}
|
||||
|
||||
// Empty output
|
||||
if (result.numel() == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
MPSStream* stream = getCurrentMPSStream();
|
||||
|
||||
@autoreleasepool {
|
||||
std::string key = func_name + ":" + getTensorsStringKey({self}) + ":" + std::to_string(alpha.to<double>()) + ":" +
|
||||
std::to_string(scale.to<double>()) + ":" + std::to_string(input_scale.to<double>());
|
||||
|
||||
auto cachedGraph = LookUpOrCreateCachedGraph<CachedGraph>(key, [&](auto mpsGraph, auto newCachedGraph) {
|
||||
MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, self);
|
||||
|
||||
// scale * (max(0, x) + min(0, alpha * (exp(input_scale * x) - 1) ))
|
||||
|
||||
MPSGraphTensor* alphaTensor = [mpsGraph constantWithScalar:alpha.to<double>()
|
||||
shape:@[ @1 ]
|
||||
dataType:getMPSDataType(self)];
|
||||
|
||||
MPSGraphTensor* inputScaleTensor = [mpsGraph constantWithScalar:input_scale.to<double>()
|
||||
shape:@[ @1 ]
|
||||
dataType:getMPSDataType(self)];
|
||||
|
||||
MPSGraphTensor* scaleTensor = [mpsGraph constantWithScalar:scale.to<double>()
|
||||
shape:@[ @1 ]
|
||||
dataType:getMPSDataType(self)];
|
||||
MPSGraphTensor* unitTensor = [mpsGraph constantWithScalar:1.0f shape:@[ @1 ] dataType:getMPSDataType(self)];
|
||||
MPSGraphTensor* zeroTensor = [mpsGraph constantWithScalar:0.0f shape:@[ @1 ] dataType:getMPSDataType(self)];
|
||||
|
||||
MPSGraphTensor* scaledInputTensor = [mpsGraph multiplicationWithPrimaryTensor:inputTensor
|
||||
secondaryTensor:inputScaleTensor
|
||||
name:nil];
|
||||
MPSGraphTensor* exponentTensor = [mpsGraph exponentWithTensor:scaledInputTensor name:nil];
|
||||
MPSGraphTensor* exponentMinusOneTensor = [mpsGraph subtractionWithPrimaryTensor:exponentTensor
|
||||
secondaryTensor:unitTensor
|
||||
name:nil];
|
||||
MPSGraphTensor* alphaTimesTensor = [mpsGraph multiplicationWithPrimaryTensor:exponentMinusOneTensor
|
||||
secondaryTensor:alphaTensor
|
||||
name:nil];
|
||||
MPSGraphTensor* predicateTensor = [mpsGraph greaterThanWithPrimaryTensor:inputTensor
|
||||
secondaryTensor:zeroTensor
|
||||
name:nil];
|
||||
MPSGraphTensor* fusedOutput = [mpsGraph selectWithPredicateTensor:predicateTensor
|
||||
truePredicateTensor:inputTensor
|
||||
falsePredicateTensor:alphaTimesTensor
|
||||
name:nil];
|
||||
MPSGraphTensor* outputTensor = [mpsGraph multiplicationWithPrimaryTensor:fusedOutput
|
||||
secondaryTensor:scaleTensor
|
||||
name:nil];
|
||||
|
||||
newCachedGraph->inputTensor_ = inputTensor;
|
||||
newCachedGraph->outputTensor_ = outputTensor;
|
||||
});
|
||||
|
||||
auto selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self, nil, executeGatherOp);
|
||||
auto outputPlaceholder = Placeholder(cachedGraph->outputTensor_, out.has_storage() ? out : result, nil, false);
|
||||
auto feeds = dictionaryFromPlaceholders(selfPlaceholder);
|
||||
runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder);
|
||||
if (out.has_storage()) {
|
||||
result.copy_(out);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// scale * (max(0, x) + min(0, alpha * (exp(input_scale * x) - 1) ))
|
||||
TORCH_IMPL_FUNC(elu_out_mps)
|
||||
(const Tensor& self, const Scalar& alpha, const Scalar& scale, const Scalar& input_scale, const Tensor& result) {
|
||||
elu_variants_out_mps(self, alpha, scale, input_scale, result, "elu_out_mps");
|
||||
}
|
||||
|
||||
TORCH_IMPL_FUNC(elu_backward_out_mps)
|
||||
(const Tensor& grad_output,
|
||||
const Scalar& alpha,
|
||||
const Scalar& scale,
|
||||
const Scalar& input_scale,
|
||||
bool is_result,
|
||||
const Tensor& self_or_result,
|
||||
const Tensor& grad_input) {
|
||||
using namespace mps;
|
||||
using CachedGraph = MPSUnaryGradCachedGraph;
|
||||
auto gradMemFormat = grad_input.suggest_memory_format();
|
||||
bool executeGatherOp = !(grad_output.is_contiguous(gradMemFormat) && self_or_result.is_contiguous(gradMemFormat) &&
|
||||
grad_input.is_contiguous(gradMemFormat));
|
||||
Tensor out;
|
||||
if (executeGatherOp && gradMemFormat == MemoryFormat::ChannelsLast) {
|
||||
out = at::empty_like(grad_input, MemoryFormat::Contiguous);
|
||||
}
|
||||
|
||||
// Empty output
|
||||
if (grad_input.numel() == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
MPSStream* stream = getCurrentMPSStream();
|
||||
|
||||
@autoreleasepool {
|
||||
std::string key = "elu_backward_out_mps:" + getTensorsStringKey({grad_output, self_or_result}) + ":" +
|
||||
std::to_string(alpha.to<double>()) + ":" + std::to_string(scale.to<double>()) + ":" +
|
||||
std::to_string(input_scale.to<double>()) + ":" + std::to_string(is_result);
|
||||
|
||||
auto cachedGraph = LookUpOrCreateCachedGraph<CachedGraph>(key, [&](auto mpsGraph, auto newCachedGraph) {
|
||||
MPSGraphTensor* gradOutputTensor = mpsGraphRankedPlaceHolder(mpsGraph, grad_output);
|
||||
MPSGraphTensor* selfOrResultTensor = mpsGraphRankedPlaceHolder(mpsGraph, self_or_result);
|
||||
MPSGraphTensor* lessThanZeroGradTensor = nil;
|
||||
|
||||
if (is_result) {
|
||||
MPSGraphTensor* alphaTensor = [mpsGraph constantWithScalar:alpha.to<double>()
|
||||
shape:@[ @1 ]
|
||||
dataType:getMPSDataType(grad_output)];
|
||||
MPSGraphTensor* resultPlusAlphaTensor = [mpsGraph additionWithPrimaryTensor:selfOrResultTensor
|
||||
secondaryTensor:alphaTensor
|
||||
name:nil];
|
||||
auto constMul = scale.to<double>() * input_scale.to<double>();
|
||||
MPSGraphTensor* constMulTensor = [mpsGraph constantWithScalar:constMul
|
||||
shape:@[ @1 ]
|
||||
dataType:getMPSDataType(grad_output)];
|
||||
lessThanZeroGradTensor = [mpsGraph multiplicationWithPrimaryTensor:resultPlusAlphaTensor
|
||||
secondaryTensor:constMulTensor
|
||||
name:nil];
|
||||
} else {
|
||||
MPSGraphTensor* inputScaleTensor = [mpsGraph constantWithScalar:input_scale.to<double>()
|
||||
shape:@[ @1 ]
|
||||
dataType:getMPSDataType(grad_output)];
|
||||
MPSGraphTensor* scaledInputTensor = [mpsGraph multiplicationWithPrimaryTensor:selfOrResultTensor
|
||||
secondaryTensor:inputScaleTensor
|
||||
name:nil];
|
||||
MPSGraphTensor* expTensor = [mpsGraph exponentWithTensor:scaledInputTensor name:nil];
|
||||
auto constMul = scale.to<double>() * input_scale.to<double>() * alpha.to<double>();
|
||||
MPSGraphTensor* constMulTensor = [mpsGraph constantWithScalar:constMul
|
||||
shape:@[ @1 ]
|
||||
dataType:getMPSDataType(grad_output)];
|
||||
lessThanZeroGradTensor = [mpsGraph multiplicationWithPrimaryTensor:expTensor
|
||||
secondaryTensor:constMulTensor
|
||||
name:nil];
|
||||
}
|
||||
|
||||
MPSGraphTensor* scaleTensor = [mpsGraph constantWithScalar:scale.to<double>()
|
||||
shape:@[ @1 ]
|
||||
dataType:getMPSDataType(grad_output)];
|
||||
MPSGraphTensor* zeroTensor = [mpsGraph constantWithScalar:0.0f
|
||||
shape:@[ @1 ]
|
||||
dataType:getMPSDataType(grad_output)];
|
||||
MPSGraphTensor* predicateTensor = [mpsGraph greaterThanWithPrimaryTensor:selfOrResultTensor
|
||||
secondaryTensor:zeroTensor
|
||||
name:nil];
|
||||
MPSGraphTensor* gradTensor = [mpsGraph selectWithPredicateTensor:predicateTensor
|
||||
truePredicateTensor:scaleTensor
|
||||
falsePredicateTensor:lessThanZeroGradTensor
|
||||
name:nil];
|
||||
MPSGraphTensor* gradInputTensor = [mpsGraph multiplicationWithPrimaryTensor:gradTensor
|
||||
secondaryTensor:gradOutputTensor
|
||||
name:nil];
|
||||
|
||||
newCachedGraph->gradOutputTensor_ = gradOutputTensor;
|
||||
newCachedGraph->inputTensor_ = selfOrResultTensor;
|
||||
newCachedGraph->gradInputTensor_ = gradInputTensor;
|
||||
});
|
||||
|
||||
Placeholder gradOutputPlaceholder = Placeholder(cachedGraph->gradOutputTensor_, grad_output, nil, executeGatherOp);
|
||||
Placeholder selfOrResultPlaceholder = Placeholder(cachedGraph->inputTensor_, self_or_result, nil, executeGatherOp);
|
||||
Placeholder gradInputPlaceholder =
|
||||
Placeholder(cachedGraph->gradInputTensor_, out.has_storage() ? out : grad_input, nil, false);
|
||||
|
||||
auto feeds = dictionaryFromPlaceholders(gradOutputPlaceholder, selfOrResultPlaceholder);
|
||||
runMPSGraph(stream, cachedGraph->graph(), feeds, gradInputPlaceholder);
|
||||
if (out.has_storage()) {
|
||||
grad_input.copy_(out);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
TORCH_IMPL_FUNC(glu_out_mps)(const Tensor& self, const int64_t dim, const Tensor& output) {
|
||||
using namespace mps;
|
||||
using CachedGraph = MPSUnaryCachedGraph;
|
||||
|
||||
@ -1,10 +1,8 @@
|
||||
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
|
||||
#include <ATen/Dispatch.h>
|
||||
#include <ATen/TensorIterator.h>
|
||||
#include <ATen/mps/MPSProfiler.h>
|
||||
#include <ATen/native/Activation.h>
|
||||
#include <ATen/native/mps/OperationUtils.h>
|
||||
#include <ATen/native/mps/kernels/Activation.h>
|
||||
#include <fmt/format.h>
|
||||
|
||||
namespace at::native {
|
||||
@ -43,30 +41,6 @@ static void hardswish_backward_kernel(at::TensorIterator& iter) {
|
||||
lib.exec_binary_kernel(iter, "hardswish_backward");
|
||||
}
|
||||
|
||||
static void elu_kernel(TensorIteratorBase& iter, const Scalar& alpha, const Scalar& scale, const Scalar& input_scale) {
|
||||
AT_DISPATCH_FLOATING_TYPES_AND2(c10::kHalf, c10::kBFloat16, iter.common_dtype(), "elu_mps", [&]() {
|
||||
ELUParams<scalar_t> params{alpha.to<scalar_t>(), scale.to<scalar_t>(), input_scale.to<scalar_t>()};
|
||||
lib.exec_unary_kernel_with_params(
|
||||
iter, "elu", params, fmt::format("ELUParams_{}", mps::scalarToMetalTypeString(iter.common_dtype())));
|
||||
});
|
||||
}
|
||||
|
||||
static void elu_backward_kernel(TensorIteratorBase& iter,
|
||||
const Scalar& alpha,
|
||||
const Scalar& scale,
|
||||
const Scalar& input_scale,
|
||||
bool is_result) {
|
||||
AT_DISPATCH_FLOATING_TYPES_AND2(c10::kHalf, c10::kBFloat16, iter.common_dtype(), "elu_backward_mps", [&]() {
|
||||
ELUBackwardParams<scalar_t> params{
|
||||
alpha.to<scalar_t>(), scale.to<scalar_t>(), input_scale.to<scalar_t>(), is_result};
|
||||
lib.exec_binary_kernel_with_params(
|
||||
iter,
|
||||
"elu_backward",
|
||||
params,
|
||||
fmt::format("ELUBackwardParams_{}", mps::scalarToMetalTypeString(iter.common_dtype())));
|
||||
});
|
||||
}
|
||||
|
||||
static void leaky_relu_kernel(TensorIteratorBase& iter, const Scalar& negative_slope) {
|
||||
lib.exec_unary_kernel(iter, "leaky_relu", negative_slope);
|
||||
}
|
||||
@ -82,8 +56,6 @@ REGISTER_DISPATCH(hardsigmoid_stub, hardsigmoid_kernel);
|
||||
REGISTER_DISPATCH(hardsigmoid_backward_stub, hardsigmoid_backward_kernel);
|
||||
REGISTER_DISPATCH(hardswish_stub, hardswish_kernel);
|
||||
REGISTER_DISPATCH(hardswish_backward_stub, hardswish_backward_kernel);
|
||||
REGISTER_DISPATCH(elu_stub, elu_kernel);
|
||||
REGISTER_DISPATCH(elu_backward_stub, elu_backward_kernel);
|
||||
REGISTER_DISPATCH(leaky_relu_stub, leaky_relu_kernel);
|
||||
REGISTER_DISPATCH(leaky_relu_backward_stub, leaky_relu_backward_kernel);
|
||||
|
||||
|
||||
@ -12064,7 +12064,8 @@
|
||||
device_check: NoCheck # TensorIterator
|
||||
python_module: nn
|
||||
dispatch:
|
||||
CPU, CUDA, MPS: elu_out
|
||||
CPU, CUDA: elu_out
|
||||
MPS: elu_out_mps
|
||||
|
||||
- func: elu(Tensor self, Scalar alpha=1, Scalar scale=1, Scalar input_scale=1) -> Tensor
|
||||
structured_delegate: elu.out
|
||||
@ -12077,7 +12078,8 @@
|
||||
structured_inherits: TensorIteratorBase
|
||||
python_module: nn
|
||||
dispatch:
|
||||
CPU, CUDA, MPS: elu_backward_out
|
||||
CPU, CUDA: elu_backward_out
|
||||
MPS: elu_backward_out_mps
|
||||
|
||||
- func: elu_backward(Tensor grad_output, Scalar alpha, Scalar scale, Scalar input_scale, bool is_result, Tensor self_or_result) -> Tensor
|
||||
structured_delegate: elu_backward.grad_input
|
||||
|
||||
@ -8184,6 +8184,130 @@ class ReproTestsDevice(torch._dynamo.test_case.TestCase):

        self.assertEqual(fn(torch.ones(3)), torch.ones(3) + 1)

    def test_pytree_get_node_type_not_traced(self):
        # Test that torch.utils._pytree._get_node_type is not traced into
        # and doesn't cause excessive trace time overhead
        from torch.utils._pytree import _get_node_type

        cnt = torch._dynamo.testing.CompileCounter()

        @torch.compile(backend=cnt, fullgraph=True)
        def fn(x, y):
            # Call _get_node_type which is used internally by pytree operations
            node_type = _get_node_type([x, y])
            assert node_type is list
            # Do some work with pytree structures
            data = {"a": x, "b": y}
            flat, spec = pytree.tree_flatten(data)
            result = flat[0] + flat[1]
            return result

        x = torch.randn(3, 4)
        y = torch.randn(3, 4)
        result = fn(x, y)
        expected = x + y

        self.assertTrue(torch.allclose(result, expected))
        # Should compile successfully with fullgraph=True
        self.assertEqual(cnt.frame_count, 1)

    def test_pytree_get_node_type_with_namedtuple(self):
        # Test that torch.utils._pytree._get_node_type handles namedtuples correctly
        # without being traced into, even when is_namedtuple_class is True
        from collections import namedtuple

        from torch.utils._pytree import _get_node_type

        Point = namedtuple("Point", ["x", "y"])

        cnt = torch._dynamo.testing.CompileCounter()

        @torch.compile(backend=cnt, fullgraph=True)
        def fn(a, b):
            # Create a namedtuple
            point = Point(a, b)
            # Call _get_node_type with a namedtuple instance
            node_type = _get_node_type(point)
            assert node_type is namedtuple
            # Use pytree operations with namedtuples
            flat, spec = pytree.tree_flatten(point)
            result = flat[0] + flat[1]
            return result

        x = torch.randn(3, 4)
        y = torch.randn(3, 4)
        result = fn(x, y)
        expected = x + y

        self.assertTrue(torch.allclose(result, expected))
        # Should compile successfully with fullgraph=True
        self.assertEqual(cnt.frame_count, 1)

    def test_pytree_tree_is_leaf_not_traced(self):
        # Test that torch.utils._pytree.tree_is_leaf is not traced into
        # when is_leaf parameter is None (the common case)
        from torch.utils._pytree import tree_is_leaf

        cnt = torch._dynamo.testing.CompileCounter()

        @torch.compile(backend=cnt, fullgraph=True)
        def fn(x, y):
            # Test with various types
            # Tensors are leaves
            is_leaf_tensor = tree_is_leaf(x)
            assert is_leaf_tensor is True

            # Lists are not leaves (they're in SUPPORTED_NODES)
            is_leaf_list = tree_is_leaf([x, y])
            assert is_leaf_list is False

            # Dicts are not leaves
            is_leaf_dict = tree_is_leaf({"a": x, "b": y})
            assert is_leaf_dict is False

            return x + y

        x = torch.randn(3, 4)
        y = torch.randn(3, 4)
        result = fn(x, y)
        expected = x + y

        self.assertTrue(torch.allclose(result, expected))
        # Should compile successfully with fullgraph=True
        self.assertEqual(cnt.frame_count, 1)

    def test_pytree_tree_is_leaf_with_namedtuple(self):
        # Test that torch.utils._pytree.tree_is_leaf handles namedtuples correctly
        from collections import namedtuple

        from torch.utils._pytree import tree_is_leaf

        Point = namedtuple("Point", ["x", "y"])

        cnt = torch._dynamo.testing.CompileCounter()

        @torch.compile(backend=cnt, fullgraph=True)
        def fn(a, b):
            # Namedtuples are not leaves (they're in SUPPORTED_NODES)
            point = Point(a, b)
            is_leaf_namedtuple = tree_is_leaf(point)
            assert is_leaf_namedtuple is False

            # But individual tensors are leaves
            is_leaf_tensor = tree_is_leaf(a)
            assert is_leaf_tensor is True

            return a + b

        x = torch.randn(3, 4)
        y = torch.randn(3, 4)
        result = fn(x, y)
        expected = x + y

        self.assertTrue(torch.allclose(result, expected))
        # Should compile successfully with fullgraph=True
        self.assertEqual(cnt.frame_count, 1)


instantiate_parametrized_tests(ReproTests)
@ -828,6 +828,9 @@ inductor_one_sample["cuda"] = {
    "nn.functional.fractional_max_pool3d": {f16, f32, f64},
    "nn.functional.group_norm": {f16},
    "nn.functional.hinge_embedding_loss": {f16},
    # Enabling all tests for this test fails randomly
    # See https://github.com/pytorch/pytorch/issues/129238
    "nn.functional.huber_loss": {f16},
    "nn.functional.interpolate.bicubic": {f16},
    "nn.functional.interpolate.bilinear": {f16},
    "nn.functional.interpolate.trilinear": {f16},
@ -945,6 +948,9 @@ inductor_one_sample["xpu"] = {
    "nn.functional.fractional_max_pool3d": {f16, f32, f64},
    "nn.functional.group_norm": {f16},
    "nn.functional.hinge_embedding_loss": {f16},
    # Enabling all tests for this test fails randomly
    # See https://github.com/pytorch/pytorch/issues/129238
    "nn.functional.huber_loss": {f16},
    "nn.functional.interpolate.bicubic": {f16},
    "nn.functional.interpolate.bilinear": {f16},
    "nn.functional.interpolate.trilinear": {f16},
Submodule third_party/fbgemm updated: e1fc43e34e...c0b988d39a
@ -421,7 +421,7 @@ RESET_GRAD_ACCUMULATOR = {"set_", "resize_"}
#    inplace or out-variants)
# If the function does not modify its arguments, we also check the following properties
# pertaining to its output:
# 2) Its TensorImpl has use_count of 1 (or 2 if it has a PyObject)
# 2) Its TensorImpl has use_count of 1
# 3) If the function is a view function, it has the same StorageImpl as that of
#    the input it is aliased with. Otherwise, its StorageImpl has use_count of 1
#
@ -496,10 +496,10 @@ if (${tensor_name}_impl_saved && !at::impl::dispatch_mode_enabled() && !at::impl
"""
)

ENFORCE_TENSOR_IMPL_USE_COUNT = CodeTemplate(
ENFORCE_TENSOR_IMPL_USE_COUNT_LT_OR_EQ_ONE = CodeTemplate(
    """\
if (!at::impl::dispatch_mode_enabled() && !at::impl::tensor_has_dispatch(${tensor_name}))
  TORCH_INTERNAL_ASSERT(${tensor_name}.use_count() == expected_fresh_use_count(${tensor_name}), "function: ${fn_name}");
  TORCH_INTERNAL_ASSERT(${tensor_name}.use_count() <= 1, "function: ${fn_name}");
"""
)

@ -1664,7 +1664,7 @@ def emit_body(

        if type_wrapper_name(f) not in DONT_ENFORCE_TENSOR_IMPL_USE_COUNT:
            stmts_after_call += [
                ENFORCE_TENSOR_IMPL_USE_COUNT.substitute(
                ENFORCE_TENSOR_IMPL_USE_COUNT_LT_OR_EQ_ONE.substitute(
                    tensor_name=ret_name, fn_name=type_wrapper_name(f)
                )
            ]
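For reference, a minimal sketch of what the `<= 1` variant of this template produces once `substitute` fills in the placeholders, assuming the same `CodeTemplate` helper from `torchgen.code_template` that the codegen files import; the tensor name `result` and function name `add` below are made-up illustration values:

# Hypothetical values: "result" and "add" are illustration-only substitutions.
from torchgen.code_template import CodeTemplate

ENFORCE_USE_COUNT = CodeTemplate(
    """\
if (!at::impl::dispatch_mode_enabled() && !at::impl::tensor_has_dispatch(${tensor_name}))
  TORCH_INTERNAL_ASSERT(${tensor_name}.use_count() <= 1, "function: ${fn_name}");
"""
)

# Substituting the placeholders yields the C++ assertion emitted after the call:
print(ENFORCE_USE_COUNT.substitute(tensor_name="result", fn_name="add"))
# if (!at::impl::dispatch_mode_enabled() && !at::impl::tensor_has_dispatch(result))
#   TORCH_INTERNAL_ASSERT(result.use_count() <= 1, "function: add");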
@ -47,18 +47,6 @@ namespace{
      meta->grad_accumulator_.reset();
    }
  }
[[maybe_unused]] size_t expected_fresh_use_count(const Variable& self) {
  if (!self.defined()) {
    // An UndefinedTensorImpl always has a use count of 0
    return 0;
  }
  if (self.unsafeGetTensorImpl()->pyobj_slot()->load_pyobj() != nullptr) {
    // A TensorImpl with a Python object has a use count of 2
    return 2;
  }
  // A fresh TensorImpl (with no PyObject) has a use count of 1
  return 1;
}
}

namespace {
@ -64,6 +64,8 @@ from .variables import (
    LocalGeneratorObjectVariable,
    NestedUserFunctionVariable,
    PolyfilledFunctionVariable,
    PyTreeGetNodeTypeFunctionVariable,
    PyTreeTreeIsLeafFunctionVariable,
    ReparametrizeModuleCallVariable,
    SkipFunctionVariable,
    TorchInGraphFunctionVariable,
@ -378,6 +380,8 @@ manual_torch_name_rule_map: dict[
    f"torch/testing/_internal/distributed/_tensor/common_dtensor.py#{TORCH_DYNAMO_RESUME_IN_PREFIX}": UserFunctionVariable,
    "torch/testing/_internal/common_distributed.py#forward": UserFunctionVariable,
    f"torch/testing/_internal/common_distributed.py#{TORCH_DYNAMO_RESUME_IN_PREFIX}": UserFunctionVariable,
    "torch.utils._pytree._get_node_type": PyTreeGetNodeTypeFunctionVariable,
    "torch.utils._pytree.tree_is_leaf": PyTreeTreeIsLeafFunctionVariable,
}
@ -64,6 +64,8 @@ from .functions import (
    LocalGeneratorObjectVariable,
    NestedUserFunctionVariable,
    PolyfilledFunctionVariable,
    PyTreeGetNodeTypeFunctionVariable,
    PyTreeTreeIsLeafFunctionVariable,
    SkipFunctionVariable,
    TMADescriptorExperimentalVariable,
    TMADescriptorStableVariable,
@ -29,6 +29,7 @@ import logging
import sys
import traceback
import types
from collections import namedtuple
from collections.abc import Callable, Sequence
from types import CellType, FunctionType
from typing import Any, Optional, TYPE_CHECKING, TypeVar
@ -38,6 +39,7 @@ from weakref import WeakKeyDictionary
import torch
from torch._dynamo.exc import get_stack_above_dynamo
from torch._guards import Source
from torch.utils._pytree import is_namedtuple_class

from .. import config, graph_break_hints, polyfills, variables
from ..bytecode_transformation import create_call_function, create_rot_n, is_generator
@ -63,6 +65,8 @@ from ..source import (
    DefaultsSource,
    GetItemSource,
    SkipGuardSource,
    TorchSource,
    TypeSource,
)
from ..utils import (
    check_constant_args,
@ -115,6 +119,13 @@ CO_VARKEYWORDS = 0x08
_spec_cache: WeakKeyDictionary[Any, Any] = WeakKeyDictionary()


@functools.lru_cache
def get_pytree_SUPPORTED_NODES_source():
    return AttrSource(
        AttrSource(AttrSource(TorchSource(), "utils"), "_pytree"), "SUPPORTED_NODES"
    )


class FunctionSpec:
    def __init__(self, func: FunctionType):
        code = func.__code__
@ -2717,3 +2728,96 @@ class CreateTMADescriptorStableVariable(VariableTracker):
            tensor=tensor,  # type: ignore[arg-type]
            block_shape=block_shape,  # type: ignore[arg-type]
        )
class PyTreeGetNodeTypeFunctionVariable(UserFunctionVariable):
    """
    `torch.utils._pytree._get_node_type` is a very hot function. We want to special-case it to reduce Dynamo tracing time.

        def _get_node_type(tree: Any) -> Any:
            node_type = type(tree)
            # All namedtuple types are implicitly registered as pytree nodes.
            # XXX: Other parts of the codebase expect namedtuple types to always return
            # `namedtuple` instead of the actual namedtuple type, even if the type
            # is explicitly registered.
            if is_namedtuple_class(node_type):
                return namedtuple
            return node_type
    """

    def call_function(
        self,
        tx: "InstructionTranslator",
        args: Sequence[VariableTracker],
        kwargs: dict[str, VariableTracker],
    ) -> VariableTracker:
        if len(args) != 1:
            raise_type_error_exc(
                tx,
                f"pytree_get_node_type requires exactly 1 argument, got {len(args)}",
            )
        type_source = None
        if args[0].source:
            install_guard(args[0].source.make_guard(GuardBuilder.TYPE_MATCH))
            type_source = TypeSource(args[0].source)
        python_type = args[0].python_type()
        if is_namedtuple_class(python_type):
            return VariableTracker.build(tx, namedtuple)
        return VariableTracker.build(tx, python_type, source=type_source)
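As a quick illustration of the eager behavior this variable constant-folds, a minimal sketch that relies only on the `_get_node_type` semantics quoted in the docstring above (not part of the diff):

# Minimal sketch: the results below follow directly from the quoted _get_node_type.
from collections import namedtuple

import torch
from torch.utils._pytree import _get_node_type

Point = namedtuple("Point", ["x", "y"])

assert _get_node_type([1, 2]) is list                 # ordinary containers report their own type
assert _get_node_type(Point(1, 2)) is namedtuple      # every namedtuple class collapses to `namedtuple`
assert _get_node_type(torch.ones(1)) is torch.Tensor  # leaves simply report their type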
class PyTreeTreeIsLeafFunctionVariable(UserFunctionVariable):
    """
    `torch.utils._pytree.tree_is_leaf` is a hot function. We want to special-case it to reduce Dynamo tracing time.

        def tree_is_leaf(
            tree: PyTree,
            is_leaf: Callable[[PyTree], bool] | None = None,
        ) -> bool:
            if is_leaf is not None and is_leaf(tree):
                return True
            return _get_node_type(tree) not in SUPPORTED_NODES

    When is_leaf is None (the common case), we can optimize by not tracing into the function.
    When is_leaf is not None, we fall back to regular tracing since it requires executing user code.
    """

    def call_function(
        self,
        tx: "InstructionTranslator",
        args: Sequence[VariableTracker],
        kwargs: dict[str, VariableTracker],
    ) -> VariableTracker:
        # tree_is_leaf(tree, is_leaf=None)
        if len(args) < 1 or len(args) > 2:
            raise_type_error_exc(
                tx,
                f"tree_is_leaf requires 1 or 2 arguments, got {len(args)}",
            )

        # Check if the is_leaf parameter is provided
        is_leaf = kwargs.get("is_leaf", ConstantVariable.create(None))
        if len(args) == 2:
            is_leaf = args[1]

        if not (
            isinstance(is_leaf, variables.ConstantVariable) and is_leaf.value is None
        ):
            return super().call_function(tx, args, kwargs)

        # Optimize the case where is_leaf is None:
        #   return _get_node_type(tree) not in SUPPORTED_NODES
        tree = args[0]
        node_type_var = PyTreeGetNodeTypeFunctionVariable(
            torch.utils._pytree._get_node_type
        ).call_function(tx, [tree], {})

        # If SUPPORTED_NODES was seen earlier and mutated, there would be a
        # source, and that will give us the mutated SUPPORTED_NODES.
        supported_nodes_var = VariableTracker.build(
            tx,
            torch.utils._pytree.SUPPORTED_NODES,
            source=get_pytree_SUPPORTED_NODES_source(),
        )
        out = supported_nodes_var.call_method(tx, "__contains__", [node_type_var], {})
        return ConstantVariable.create(not out.value)
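The `is_leaf=None` fast path above folds to the same membership test that runs in eager mode; a minimal sketch (not part of the diff):

# Minimal sketch of the is_leaf=None fast path that the variable folds to a constant.
import torch
from torch.utils._pytree import SUPPORTED_NODES, _get_node_type, tree_is_leaf

x = torch.randn(2)
assert tree_is_leaf(x) is True        # a tensor's type is not a registered pytree node
assert tree_is_leaf([x, x]) is False  # list is registered in SUPPORTED_NODES
# The equivalent check the Dynamo fast path performs:
assert (_get_node_type([x, x]) not in SUPPORTED_NODES) is False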
@ -138,7 +138,7 @@ inline void PyErr_SetString(PyObject* type, const std::string& message) {
    throw;                            \
  }                                   \
}                                     \
catch (const std::exception&) {       \
catch (const std::exception& e) {     \
  torch::translate_exception_to_python(std::current_exception()); \
  return retval;                      \
}

@ -81,7 +81,7 @@ c10::intrusive_ptr<Backend> ProcessGroup::getBackend(
  ProcessGroup::BackendType backendType{ProcessGroup::BackendType::UNDEFINED};
  try {
    backendType = deviceTypeToBackendType_.at(deviceType);
  } catch (const std::out_of_range&) {
  } catch (const std::out_of_range& e) {
    TORCH_CHECK(
        false, "No backend type associated with device type ", deviceType);
  }

@ -246,7 +246,7 @@ class UvTcpServer : public UvTcpSocket {
          uv_err_name(uv_res),
          uv_strerror(uv_res)));
      res->cacheSocketPort();
    } catch (std::exception&) {
    } catch (std::exception& ex) {
      res->close();
      throw;
    }
@ -322,7 +322,7 @@ class UvTcpServer : public UvTcpSocket {
          uv_err_name(uv_res),
          uv_strerror(uv_res)));
      res->cacheSocketPort();
    } catch (std::exception&) {
    } catch (std::exception& ex) {
      res->close();
      throw;
    }

@ -353,7 +353,7 @@ static PyObject* NodeBase__update_args_kwargs(
    Py_CLEAR(node->_kwargs);
    node->_kwargs = map_aggregate(args[1], visit_fn);
    Py_RETURN_NONE;
  } catch (const PythonError&) {
  } catch (const PythonError& e) {
    return nullptr;
  }
}
@ -397,7 +397,7 @@ static PyObject* NodeBase__replace_input_with(

    PyObject* update_args[2] = {new_args.get(), new_kwargs.get()};
    return NodeBase__update_args_kwargs(self, update_args, 2);
  } catch (const PythonError&) {
  } catch (const PythonError& e) {
    return nullptr;
  }
}
@ -802,7 +802,7 @@ static PyObject* py_map_aggregate(
    // args[0]: aggregate, args[1]: callable fn
    return map_aggregate(
        args[0], [fn](PyObject* a) { return PyObject_CallOneArg(fn, a); });
  } catch (const PythonError&) {
  } catch (const PythonError& e) {
    return nullptr; // error should already be set
  }
}
@ -824,7 +824,7 @@ static PyObject* py_map_arg(
      }
      return Py_NewRef(a);
    });
  } catch (const PythonError&) {
  } catch (const PythonError& e) {
    return nullptr; // error should already be set
  }
}

@ -117,7 +117,7 @@ struct type_caster<torch::jit::IValue> {
    try {
      value = torch::jit::toTypeInferredIValue(src);
      return true;
    } catch (std::exception&) {
    } catch (std::exception& e) {
      return false;
    }
  }
@ -142,7 +142,7 @@ struct type_caster<torch::jit::Symbol> {
    std::string src_str;
    try {
      src_str = py::cast<std::string>(src);
    } catch (std::exception&) {
    } catch (std::exception& e) {
      return false;
    }
    value = torch::jit::Symbol::fromQualString(src_str);

@ -285,7 +285,7 @@ struct FromImpl<torch::headeronly::HeaderOnlyArrayRef<T>> {
          torch_list_push_back(new_list_handle, from(elem)));
    }
    return from(new_list_handle);
  } catch (const std::runtime_error&) {
  } catch (const std::runtime_error& e) {
    if (new_list_handle != nullptr) {
      // clean up memory if an error was thrown
      TORCH_ERROR_CODE_CHECK(torch_delete_list(new_list_handle));
@ -553,7 +553,7 @@ struct ToImpl<std::vector<T>> {
    }
    TORCH_ERROR_CODE_CHECK(torch_delete_list(list_handle));
    return result;
  } catch (const std::runtime_error&) {
  } catch (const std::runtime_error& e) {
    // clean up memory if an exception is thrown, and rethrow
    TORCH_ERROR_CODE_CHECK(torch_delete_list(list_handle));
    throw;