Mirror of https://github.com/pytorch/pytorch.git, synced 2025-11-19 10:04:58 +08:00

Compare commits
7 Commits: ciflow/vll ... update_sub

| Author | SHA1 | Date |
|---|---|---|
|  | 989cf62265 |  |
|  | dc4f3c7505 |  |
|  | e8970ba010 |  |
|  | 41999a579d |  |
|  | ebb2001a48 |  |
|  | ae85307512 |  |
|  | 7921c0eb0e |  |
@@ -125,10 +125,10 @@ case "$tag" in
       UCC_COMMIT=${_UCC_COMMIT}
       TRITON=yes
       ;;
-    pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks)
+    pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11-inductor-benchmarks)
       CUDA_VERSION=12.8.1
       ANACONDA_PYTHON_VERSION=3.10
-      GCC_VERSION=9
+      GCC_VERSION=11
       VISION=yes
       KATEX=yes
       UCX_COMMIT=${_UCX_COMMIT}
@@ -146,16 +146,6 @@ case "$tag" in
       UCC_COMMIT=${_UCC_COMMIT}
       TRITON=yes
       ;;
-    pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9)
-      CUDA_VERSION=12.8.1
-      ANACONDA_PYTHON_VERSION=3.10
-      GCC_VERSION=9
-      VISION=yes
-      KATEX=yes
-      UCX_COMMIT=${_UCX_COMMIT}
-      UCC_COMMIT=${_UCC_COMMIT}
-      TRITON=yes
-      ;;
     pytorch-linux-jammy-py3-clang12-onnx)
       ANACONDA_PYTHON_VERSION=3.10
       CLANG_VERSION=12
@@ -23,7 +23,7 @@ jobs:
     uses: ./.github/workflows/_linux-build.yml
     with:
       runner: linux.12xlarge.memory
-      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm80
       docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
       cuda-arch-list: '8.0 9.0'
       test-matrix: |
@@ -39,7 +39,7 @@ jobs:
     needs: attn-microbenchmark-build
     with:
       timeout-minutes: 500
-      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm80
       docker-image: ${{ needs.attn-microbenchmark-build.outputs.docker-image }}
       test-matrix: ${{ needs.attn-microbenchmark-build.outputs.test-matrix }}
     secrets: inherit
@@ -51,7 +51,7 @@ jobs:
     uses: ./.github/workflows/_linux-build.yml
     with:
       runner: linux.12xlarge.memory
-      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100
       docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
       cuda-arch-list: '10.0'
       test-matrix: |
@@ -66,7 +66,7 @@ jobs:
     needs: opmicrobenchmark-build-b200
     with:
       timeout-minutes: 500
-      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100
       docker-image: ${{ needs.opmicrobenchmark-build-b200.outputs.docker-image }}
       test-matrix: ${{ needs.opmicrobenchmark-build-b200.outputs.test-matrix }}
       aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
3 .github/workflows/docker-builds.yml vendored
@@ -52,8 +52,7 @@ jobs:
           pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11,
           pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11,
           pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc11-vllm,
-          pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks,
-          pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9,
+          pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11-inductor-benchmarks,
           pytorch-linux-jammy-cuda12.4-cudnn9-py3-gcc11,
           pytorch-linux-jammy-py3.10-clang12,
           pytorch-linux-jammy-py3.11-clang12,
7 .github/workflows/docker-cache-rocm.yml vendored
@@ -50,9 +50,10 @@ jobs:
       matrix:
         runner: [linux.rocm.gfx942.docker-cache]
         docker-image: [
-          "${{ needs.download-docker-builds-artifacts.outputs.pytorch-linux-jammy-rocm-n-py3 }}",
-          "${{ needs.download-docker-builds-artifacts.outputs.pytorch-linux-noble-rocm-n-py3 }}",
-          "${{ needs.download-docker-builds-artifacts.outputs.pytorch-linux-jammy-rocm-n-py3-benchmarks }}"
+          "${{ needs.download-docker-builds-artifacts.outputs.pytorch-linux-jammy-rocm-n-py3 }}"
+          #"${{ needs.download-docker-builds-artifacts.outputs.pytorch-linux-jammy-rocm-n-py3 }}",
+          #"${{ needs.download-docker-builds-artifacts.outputs.pytorch-linux-noble-rocm-n-py3 }}",
+          #"${{ needs.download-docker-builds-artifacts.outputs.pytorch-linux-jammy-rocm-n-py3-benchmarks }}"
         ]
     runs-on: "${{ matrix.runner }}"
     steps:
10 .github/workflows/inductor-micro-benchmark.yml vendored
@@ -30,14 +30,14 @@ jobs:
       opt_out_experiments: lf

   build:
-    name: cuda12.8-py3.10-gcc9-sm80
+    name: cuda12.8-py3.10-gcc11-sm80
     uses: ./.github/workflows/_linux-build.yml
     needs:
       - get-default-label-prefix
     with:
       runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
-      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
-      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm80
+      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11-inductor-benchmarks
       cuda-arch-list: '8.0'
       test-matrix: |
         { include: [
@@ -46,11 +46,11 @@ jobs:
     secrets: inherit

   test:
-    name: cuda12.8-py3.10-gcc9-sm80
+    name: cuda12.8-py3.10-gcc11-sm80
     uses: ./.github/workflows/_linux-test.yml
     needs: build
     with:
-      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm80
       docker-image: ${{ needs.build.outputs.docker-image }}
       test-matrix: ${{ needs.build.outputs.test-matrix }}
       timeout-minutes: 720
10 .github/workflows/inductor-perf-compare.yml vendored
@@ -27,14 +27,14 @@ jobs:
       opt_out_experiments: lf

   build:
-    name: cuda12.8-py3.10-gcc9-sm80
+    name: cuda12.8-py3.10-gcc11-sm80
     uses: ./.github/workflows/_linux-build.yml
     needs:
       - get-default-label-prefix
     with:
       runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
-      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
-      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm80
+      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11-inductor-benchmarks
       cuda-arch-list: '8.0'
       test-matrix: |
         { include: [
@@ -47,11 +47,11 @@ jobs:
     secrets: inherit

   test:
-    name: cuda12.8-py3.10-gcc9-sm80
+    name: cuda12.8-py3.10-gcc11-sm80
     uses: ./.github/workflows/_linux-test.yml
     needs: build
     with:
-      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm80
       docker-image: ${{ needs.build.outputs.docker-image }}
       test-matrix: ${{ needs.build.outputs.test-matrix }}
       # disable monitor in perf tests for more investigation
18 .github/workflows/inductor-perf-test-b200.yml vendored
@@ -80,7 +80,7 @@ jobs:
       opt_out_experiments: lf

   build:
-    name: cuda12.8-py3.10-gcc9-sm100
+    name: cuda12.8-py3.10-gcc11-sm100
     uses: ./.github/workflows/_linux-build.yml
     needs: get-label-type
     with:
@@ -90,8 +90,8 @@ jobs:
       # from trunk. Also use a memory-intensive runner here because memory is
       # usually the bottleneck
       runner: linux.12xlarge.memory
-      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100
-      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100
+      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11-inductor-benchmarks
       cuda-arch-list: '10.0'
       test-matrix: |
         { include: [
@@ -104,12 +104,12 @@ jobs:
     secrets: inherit

   test-periodically:
-    name: cuda12.8-py3.10-gcc9-sm100
+    name: cuda12.8-py3.10-gcc11-sm100
     uses: ./.github/workflows/_linux-test.yml
     needs: build
     if: github.event.schedule == '0 7 * * 1-6'
     with:
-      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100
       dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-cudagraphs_low_precision-true
       docker-image: ${{ needs.build.outputs.docker-image }}
       test-matrix: ${{ needs.build.outputs.test-matrix }}
@@ -121,12 +121,12 @@ jobs:
     secrets: inherit

   test-weekly:
-    name: cuda12.8-py3.10-gcc9-sm100
+    name: cuda12.8-py3.10-gcc11-sm100
     uses: ./.github/workflows/_linux-test.yml
     needs: build
     if: github.event.schedule == '0 7 * * 0'
     with:
-      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100
       dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-maxautotune-true-freeze_autotune_cudagraphs-true-cudagraphs_low_precision-true
       docker-image: ${{ needs.build.outputs.docker-image }}
       test-matrix: ${{ needs.build.outputs.test-matrix }}
@@ -138,11 +138,11 @@ jobs:
     secrets: inherit

   test:
-    name: cuda12.8-py3.10-gcc9-sm100
+    name: cuda12.8-py3.10-gcc11-sm100
     uses: ./.github/workflows/_linux-test.yml
     needs: build
     with:
-      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100
       dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cudagraphs-${{ inputs.cudagraphs }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}-maxautotune-${{ inputs.maxautotune }}-freezing_cudagraphs-${{ inputs.freezing_cudagraphs }}-cudagraphs_low_precision-${{ inputs.cudagraphs }}
       docker-image: ${{ needs.build.outputs.docker-image }}
       test-matrix: ${{ needs.build.outputs.test-matrix }}
@@ -95,8 +95,8 @@ jobs:
       # from trunk. Also use a memory-intensive runner here because memory is
       # usually the bottleneck
       runner: linux.12xlarge.memory
-      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm90
-      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm90
+      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11-inductor-benchmarks
       cuda-arch-list: '9.0'
       test-matrix: |
         { include: [
@@ -132,7 +132,7 @@ jobs:
     needs: build
     if: github.event.schedule == '15 0 * * 1-6'
     with:
-      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm90
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm90
       dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-cudagraphs_low_precision-true
       docker-image: ${{ needs.build.outputs.docker-image }}
       test-matrix: ${{ needs.build.outputs.test-matrix }}
@@ -149,7 +149,7 @@ jobs:
     needs: build
     if: github.event.schedule == '0 7 * * 0'
     with:
-      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm90
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm90
       dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-maxautotune-true-freeze_autotune_cudagraphs-true-cudagraphs_low_precision-true
       docker-image: ${{ needs.build.outputs.docker-image }}
       test-matrix: ${{ needs.build.outputs.test-matrix }}
@@ -168,7 +168,7 @@ jobs:
     # needs one round of benchmark
     if: ${{ github.event_name == 'workflow_dispatch' || github.event_name == 'pull_request' }}
     with:
-      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm90
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm90
       dashboard-tag: training-${{ inputs.training || 'true' }}-inference-${{ inputs.inference || 'true' }}-default-${{ inputs.default || 'true' }}-dynamic-${{ inputs.dynamic || 'true' }}-cudagraphs-${{ inputs.cudagraphs || 'true' }}-cppwrapper-${{ inputs.cppwrapper || 'false' }}-aotinductor-${{ inputs.aotinductor || 'false' }}-maxautotune-${{ inputs.maxautotune || 'false' }}-freezing_cudagraphs-${{ inputs.freezing_cudagraphs || 'false' }}-cudagraphs_low_precision-${{ inputs.cudagraphs || 'false' }}
       docker-image: ${{ needs.build.outputs.docker-image }}
       test-matrix: ${{ needs.build.outputs.test-matrix }}
18 .github/workflows/inductor-perf-test-nightly.yml vendored
@@ -80,15 +80,15 @@ jobs:
       opt_out_experiments: lf

   build:
-    name: cuda12.8-py3.10-gcc9-sm80
+    name: cuda12.8-py3.10-gcc11-sm80
     uses: ./.github/workflows/_linux-build.yml
     needs: get-label-type
     with:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       # Every bit to make perf run faster helps
       runner: linux.12xlarge.memory
-      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
-      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm80
+      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11-inductor-benchmarks
       cuda-arch-list: '8.0'
       test-matrix: |
         { include: [
@@ -117,12 +117,12 @@ jobs:
     secrets: inherit

   test-nightly:
-    name: cuda12.8-py3.10-gcc9-sm80
+    name: cuda12.8-py3.10-gcc11-sm80
     uses: ./.github/workflows/_linux-test.yml
     needs: build
     if: github.event.schedule == '0 7 * * 1-6'
     with:
-      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm80
       dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-cudagraphs_low_precision-true
       docker-image: ${{ needs.build.outputs.docker-image }}
       test-matrix: ${{ needs.build.outputs.test-matrix }}
@@ -133,12 +133,12 @@ jobs:
     secrets: inherit

   test-weekly:
-    name: cuda12.8-py3.10-gcc9-sm80
+    name: cuda12.8-py3.10-gcc11-sm80
     uses: ./.github/workflows/_linux-test.yml
     needs: build
     if: github.event.schedule == '0 7 * * 0'
     with:
-      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm80
       dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-maxautotune-true-freeze_autotune_cudagraphs-true-cudagraphs_low_precision-true
       docker-image: ${{ needs.build.outputs.docker-image }}
       test-matrix: ${{ needs.build.outputs.test-matrix }}
@@ -150,12 +150,12 @@ jobs:
     secrets: inherit

   test:
-    name: cuda12.8-py3.10-gcc9-sm80
+    name: cuda12.8-py3.10-gcc11-sm80
     uses: ./.github/workflows/_linux-test.yml
     needs: build
     if: github.event_name == 'workflow_dispatch'
     with:
-      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm80
       dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cudagraphs-${{ inputs.cudagraphs }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}-maxautotune-${{ inputs.maxautotune }}-freezing_cudagraphs-${{ inputs.freezing_cudagraphs }}-cudagraphs_low_precision-${{ inputs.cudagraphs }}
       docker-image: ${{ needs.build.outputs.docker-image }}
       test-matrix: ${{ needs.build.outputs.test-matrix }}
12 .github/workflows/inductor-periodic.yml vendored
@@ -37,8 +37,8 @@ jobs:
     needs: get-default-label-prefix
     with:
       runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
-      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86
-      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm86
+      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11-inductor-benchmarks
       cuda-arch-list: '8.0;8.6'
       test-matrix: |
         { include: [
@@ -76,7 +76,7 @@ jobs:
     uses: ./.github/workflows/_linux-test.yml
     needs: periodic-dynamo-benchmarks-build
     with:
-      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm86
       docker-image: ${{ needs.periodic-dynamo-benchmarks-build.outputs.docker-image }}
       test-matrix: ${{ needs.periodic-dynamo-benchmarks-build.outputs.test-matrix }}
     secrets: inherit
@@ -138,8 +138,8 @@ jobs:
       - get-default-label-prefix
     with:
       runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
-      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
-      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm80
+      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11-inductor-benchmarks
       cuda-arch-list: '8.0'
       test-matrix: |
         { include: [
@@ -153,7 +153,7 @@ jobs:
     uses: ./.github/workflows/_linux-test.yml
     needs: inductor-smoke-build
     with:
-      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm80
       docker-image: ${{ needs.inductor-smoke-build.outputs.docker-image }}
       test-matrix: ${{ needs.inductor-smoke-build.outputs.test-matrix }}
     secrets: inherit
6 .github/workflows/inductor-unittest.yml vendored
@@ -33,8 +33,8 @@ jobs:
     uses: ./.github/workflows/_linux-build.yml
     needs: get-label-type
     with:
-      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86
-      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm86
+      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11-inductor-benchmarks
       cuda-arch-list: '8.6'
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       test-matrix: |
@@ -52,7 +52,7 @@ jobs:
     uses: ./.github/workflows/_linux-test.yml
     needs: inductor-build
     with:
-      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm86
       docker-image: ${{ needs.inductor-build.outputs.docker-image }}
       test-matrix: ${{ needs.inductor-build.outputs.test-matrix }}
     secrets: inherit
6 .github/workflows/inductor.yml vendored
@@ -49,8 +49,8 @@ jobs:
     uses: ./.github/workflows/_linux-build.yml
     needs: get-label-type
     with:
-      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86
-      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm86
+      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11-inductor-benchmarks
       cuda-arch-list: '8.6'
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       test-matrix: |
@@ -69,7 +69,7 @@ jobs:
     uses: ./.github/workflows/_linux-test.yml
     needs: inductor-build
     with:
-      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm86
       docker-image: ${{ needs.inductor-build.outputs.docker-image }}
       test-matrix: ${{ needs.inductor-build.outputs.test-matrix }}
     secrets: inherit
@@ -25,7 +25,7 @@ jobs:
     uses: ./.github/workflows/_linux-build.yml
     with:
       runner: linux.12xlarge.memory
-      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm80
       docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
       cuda-arch-list: '8.0 9.0'
       test-matrix: |
@@ -41,7 +41,7 @@ jobs:
     needs: opmicrobenchmark-build
     with:
       timeout-minutes: 500
-      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm80
       docker-image: ${{ needs.opmicrobenchmark-build.outputs.docker-image }}
       test-matrix: ${{ needs.opmicrobenchmark-build.outputs.test-matrix }}
     secrets: inherit
@@ -53,7 +53,7 @@ jobs:
     uses: ./.github/workflows/_linux-build.yml
     with:
       runner: linux.12xlarge.memory
-      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100
       docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
       cuda-arch-list: '10.0'
       test-matrix: |
@@ -68,7 +68,7 @@ jobs:
     needs: opmicrobenchmark-build-b200
     with:
       timeout-minutes: 500
-      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100
       docker-image: ${{ needs.opmicrobenchmark-build-b200.outputs.docker-image }}
       test-matrix: ${{ needs.opmicrobenchmark-build-b200.outputs.test-matrix }}
       aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
51 .github/workflows/periodic.yml vendored
@@ -90,6 +90,7 @@ jobs:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       build-environment: linux-jammy-cuda12.8-py3.10-gcc11
       docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
+      cuda-arch-list: 8.6
       test-matrix: |
         { include: [
           { config: "nogpu_AVX512", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
@@ -97,7 +98,9 @@ jobs:
           { config: "nogpu_AVX512", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
           { config: "nogpu_NO_AVX2", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
           { config: "nogpu_NO_AVX2", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
-          { config: "jit_legacy", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
+          { config: "jit_legacy", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
+          { config: "multigpu", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.12xlarge.nvidia.gpu", owners: ["oncall:distributed"] },
+          { config: "multigpu", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.12xlarge.nvidia.gpu", owners: ["oncall:distributed"] },
         ]}
     secrets: inherit

@@ -113,40 +116,14 @@ jobs:
       test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build.outputs.test-matrix }}
     secrets: inherit

-  linux-jammy-cuda12_8-py3_10-gcc9-build:
-    name: linux-jammy-cuda12.8-py3.10-gcc9
+  linux-jammy-cuda12_8-py3_10-gcc11-debug-build:
+    name: linux-jammy-cuda12.8-py3.10-gcc11-debug
     uses: ./.github/workflows/_linux-build.yml
     needs: get-label-type
     with:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      build-environment: linux-jammy-cuda12.8-py3.10-gcc9
-      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9
-      cuda-arch-list: 8.6
-      test-matrix: |
-        { include: [
-          { config: "multigpu", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.12xlarge.nvidia.gpu", owners: ["oncall:distributed"] },
-          { config: "multigpu", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.12xlarge.nvidia.gpu", owners: ["oncall:distributed"] },
-        ]}
-    secrets: inherit
-
-  linux-jammy-cuda12_8-py3_10-gcc9-test:
-    name: linux-jammy-cuda12.8-py3.10-gcc9
-    uses: ./.github/workflows/_linux-test.yml
-    needs: linux-jammy-cuda12_8-py3_10-gcc9-build
-    with:
-      build-environment: linux-jammy-cuda12.8-py3.10-gcc9
-      docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-build.outputs.test-matrix }}
-    secrets: inherit
-
-  linux-jammy-cuda12_8-py3_10-gcc9-debug-build:
-    name: linux-jammy-cuda12.8-py3.10-gcc9-debug
-    uses: ./.github/workflows/_linux-build.yml
-    needs: get-label-type
-    with:
-      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-debug
-      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-debug
+      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
       cuda-arch-list: 8.9
       test-matrix: |
         { include: [
@@ -160,16 +137,16 @@ jobs:
         ]}
     secrets: inherit

-  linux-jammy-cuda12_8-py3_10-gcc9-debug-test:
-    name: linux-jammy-cuda12.8-py3.10-gcc9-debug
+  linux-jammy-cuda12_8-py3_10-gcc11-debug-test:
+    name: linux-jammy-cuda12.8-py3.10-gcc11-debug
     uses: ./.github/workflows/_linux-test.yml
     needs:
-      - linux-jammy-cuda12_8-py3_10-gcc9-debug-build
+      - linux-jammy-cuda12_8-py3_10-gcc11-debug-build
      - target-determination
     with:
-      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-debug
-      docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-debug-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-debug-build.outputs.test-matrix }}
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-debug
+      docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-debug-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-debug-build.outputs.test-matrix }}
     secrets: inherit

   linux-jammy-cuda13_0-py3_10-gcc11-build:
20 .github/workflows/pull.yml vendored
@@ -318,14 +318,14 @@ jobs:
         ]}
     secrets: inherit

-  linux-jammy-cuda12_8-py3_10-gcc9-inductor-build:
-    name: cuda12.8-py3.10-gcc9-sm75
+  linux-jammy-cuda12_8-py3_10-gcc11-inductor-build:
+    name: cuda12.8-py3.10-gcc11-sm75
     uses: ./.github/workflows/_linux-build.yml
     needs: get-label-type
     with:
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm75
-      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm75
+      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11-inductor-benchmarks
       cuda-arch-list: '7.5'
       test-matrix: |
         { include: [
@@ -333,14 +333,14 @@ jobs:
         ]}
     secrets: inherit

-  linux-jammy-cuda12_8-py3_10-gcc9-inductor-test:
-    name: cuda12.8-py3.10-gcc9-sm75
+  linux-jammy-cuda12_8-py3_10-gcc11-inductor-test:
+    name: cuda12.8-py3.10-gcc11-sm75
     uses: ./.github/workflows/_linux-test.yml
-    needs: linux-jammy-cuda12_8-py3_10-gcc9-inductor-build
+    needs: linux-jammy-cuda12_8-py3_10-gcc11-inductor-build
     with:
-      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm75
-      docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.test-matrix }}
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm75
+      docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-inductor-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-inductor-build.outputs.test-matrix }}
     secrets: inherit

   linux-noble-xpu-n-py3_10-build:
10 .github/workflows/torchbench.yml vendored
@@ -26,14 +26,14 @@ jobs:
       curr_ref_type: ${{ github.ref_type }}

   build:
-    name: cuda12.8-py3.10-gcc9-sm80
+    name: cuda12.8-py3.10-gcc11-sm80
     uses: ./.github/workflows/_linux-build.yml
     needs:
       - get-default-label-prefix
     with:
       runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
-      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
-      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm80
+      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11-inductor-benchmarks
       cuda-arch-list: '8.0'
       test-matrix: |
         { include: [
@@ -42,11 +42,11 @@ jobs:
     secrets: inherit

   test:
-    name: cuda12.8-py3.10-gcc9-sm80
+    name: cuda12.8-py3.10-gcc11-sm80
     uses: ./.github/workflows/_linux-test.yml
     needs: build
     with:
-      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm80
       docker-image: ${{ needs.build.outputs.docker-image }}
       test-matrix: ${{ needs.build.outputs.test-matrix }}
     secrets: inherit
4 .github/workflows/trunk.yml vendored
@@ -231,8 +231,8 @@ jobs:
     uses: ./.github/workflows/_linux-build.yml
     needs: get-label-type
     with:
-      build-environment: linux-jammy-cuda12.8-py3.12-gcc9-sm80
-      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
+      build-environment: linux-jammy-cuda12.8-py3.12-gcc11-sm80
+      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11-inductor-benchmarks
       cuda-arch-list: '8.0'
     secrets: inherit
@@ -813,8 +813,43 @@ void smooth_l1_kernel(TensorIteratorBase& iter, double beta) {
 }

 void huber_kernel(TensorIterator& iter, double delta) {
   AT_DISPATCH_FLOATING_TYPES_AND2(
       kBFloat16, kHalf, iter.dtype(), "huber_cpu", [&]() {
+        // Special-case kHalf: compute in float for numerical stability
+        if (iter.dtype() == kHalf) {
+          const float delta_val(static_cast<float>(delta));
+          const Vectorized<float> delta_vec(static_cast<float>(delta));
+          const Vectorized<float> point_five_vec(static_cast<float>(0.5));
+          cpu_kernel_vec(
+              iter,
+              // scalar lambda: convert half -> float, compute in float, cast back to half
+              [&delta_val] (at::Half a, at::Half b) -> at::Half {
+                float af = static_cast<float>(a);
+                float bf = static_cast<float>(b);
+                float z = std::abs(af - bf);
+                float out = z < delta_val
+                    ? 0.5f * z * z
+                    : delta_val * (z - 0.5f * delta_val);
+                return static_cast<at::Half>(out);
+              },
+              [&delta_vec, &point_five_vec] (Vectorized<Half> a, Vectorized<Half> b) {
+                auto [a0, a1] = convert_half_float(a);
+                auto [b0, b1] = convert_half_float(b);
+                auto z = (a0 - b0).abs();
+                a0 = Vectorized<float>::blendv(
+                    point_five_vec * z * z,
+                    delta_vec * (z - point_five_vec * delta_vec),
+                    z >= delta_vec);
+                z = (a1 - b1).abs();
+                a1 = Vectorized<float>::blendv(
+                    point_five_vec * z * z,
+                    delta_vec * (z - point_five_vec * delta_vec),
+                    z >= delta_vec);
+                return convert_float_half(a0, a1);
+              }
+          );
+          return;
+        }
+        else {
+          AT_DISPATCH_FLOATING_TYPES_AND(kBFloat16, iter.dtype(), "huber_cpu", [&]() {
           using Vec = Vectorized<scalar_t>;
           const scalar_t delta_val(delta);
           const Vec delta_val_vec(delta_val);
@@ -835,6 +870,7 @@ void huber_kernel(TensorIterator& iter, double delta) {
             z >= delta_val_vec);
         });
       });
+  }
 }

 void sigmoid_backward_kernel(TensorIteratorBase& iter) {
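The scalar branch above is the whole trick: each at::Half is widened to float, the piecewise Huber formula is evaluated at float precision, and the result is narrowed back at the end. A minimal standalone sketch of that piecewise function, assuming plain floats in place of ATen's Half/Vectorized machinery, for readers who want to check the two branches:

```cpp
// Minimal sketch of the Huber formula used in the kernel above; plain floats
// stand in for at::Half, so this is the math only, not the PyTorch kernel.
#include <cmath>
#include <cstdio>

float huber(float a, float b, float delta) {
  float z = std::fabs(a - b);
  return z < delta ? 0.5f * z * z                 // quadratic branch near zero
                   : delta * (z - 0.5f * delta);  // linear branch past delta
}

int main() {
  std::printf("%f\n", huber(1.25f, 1.0f, 1.0f)); // 0.031250 (quadratic branch)
  std::printf("%f\n", huber(3.0f, 1.0f, 1.0f));  // 1.500000 (linear branch)
  return 0;
}
```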
@@ -147,6 +147,19 @@ class MetalShaderLibrary {
       const std::optional<c10::Scalar> alpha = std::nullopt,
       const std::optional<c10::ScalarType> scalar_arg_type = std::nullopt);

+  template <typename T>
+  void exec_unary_kernel_with_params(
+      TensorIteratorBase& iter,
+      const std::string& name,
+      T params,
+      const std::string& params_type_name);
+  template <typename T>
+  void exec_binary_kernel_with_params(
+      TensorIteratorBase& iter,
+      const std::string& name,
+      T params,
+      const std::string& params_type_name);
+
  protected:
   virtual MTLLibrary_t getLibrary();
   virtual MTLLibrary_t getLibrary(
@@ -7,10 +7,12 @@
 #include <ATen/Tensor.h>
 #include <ATen/TensorIterator.h>
 #include <ATen/Utils.h>
+#include <ATen/mps/MPSProfiler.h>
 #include <ATen/mps/MPSStream.h>
 #include <ATen/native/mps/MetalShaderLibrary.h>
 #include <ATen/native/mps/TensorFactory.h>
 #include <c10/core/ScalarType.h>
+#include <fmt/format.h>
 #include <torch/library.h>
 #include <unordered_map>
@@ -630,4 +632,147 @@ inline bool needsGather(const TensorBase& t) {
   return !is_macOS_15_0_or_newer && (!t.is_contiguous() || t.storage_offset());
 }

+template <typename T>
+void MetalShaderLibrary::exec_unary_kernel_with_params(TensorIteratorBase& iter,
+                                                       const std::string& name,
+                                                       T params,
+                                                       const std::string& params_type_name) {
+  using namespace at::mps;
+  // Decompose 64-bit tensor into 32-bit ones
+  if (!iter.can_use_32bit_indexing()) {
+    for (auto&& sub_iter : iter.with_32bit_indexing()) {
+      exec_unary_kernel_with_params(sub_iter, name, params, params_type_name);
+    }
+    return;
+  }
+
+  auto inputTensor = iter.input(0);
+  auto outputTensor = iter.output(0);
+  uint32_t length = iter.numel();
+  if (length == 0) {
+    return;
+  }
+  auto kernel_name = fmt::format("{}_{}_{}_{}{}",
+                                 name,
+                                 iter.is_contiguous() ? "dense" : "strided",
+                                 scalarToMetalTypeString(outputTensor),
+                                 scalarToMetalTypeString(inputTensor),
+                                 fmt::format("_{}", params_type_name));
+  @autoreleasepool {
+    auto cplState = getPipelineStateForFunc(kernel_name);
+
+    MPSStream* mpsStream = getCurrentMPSStream();
+    dispatch_sync(mpsStream->queue(), ^() {
+      auto computeEncoder = mpsStream->commandEncoder();
+
+      getMPSProfiler().beginProfileKernel(cplState, name, {inputTensor});
+
+      [computeEncoder setComputePipelineState:cplState];
+      bind_iter_tensors(computeEncoder, iter);
+      if (!iter.is_contiguous()) {
+        mtl_setArgs<2>(computeEncoder,
+                       outputTensor.sizes(),
+                       inputTensor.strides(),
+                       outputTensor.strides(),
+                       inputTensor.ndimension());
+      }
+      detail::mtl_setArg(computeEncoder, params, iter.is_contiguous() ? 2 : 6);
+      mtl_dispatch1DJob(computeEncoder, cplState, length);
+
+      getMPSProfiler().endProfileKernel(cplState);
+    });
+  }
+}
+
+template <typename T>
+void MetalShaderLibrary::exec_binary_kernel_with_params(TensorIteratorBase& iter,
+                                                        const std::string& name,
+                                                        T params,
+                                                        const std::string& params_type_name) {
+  using namespace mps;
+  // TODO: Figure a better place to downcast double scalars (probably in tensor iterator itself?)
+  // Right now running something like 1.0-torch.rand(5, device='mps') will create iterator with
+  // double as common dtype (because Python floating point are always 64-bit values)
+  TORCH_CHECK(iter.output().scalar_type() != at::kDouble, "float64 is not supported on MPS");
+
+  // Skip for empty iterators
+  if (iter.numel() == 0) {
+    return;
+  }
+
+  // Decompose 64-bit tensor into 32-bit ones
+  if (!iter.can_use_32bit_indexing()) {
+    for (auto&& sub_iter : iter.with_32bit_indexing()) {
+      exec_binary_kernel_with_params(sub_iter, name, params, params_type_name);
+    }
+    return;
+  }
+
+  auto convert_double_scalar = [](Tensor& t) {
+    if (t.dim() != 0) {
+      return;
+    }
+    if (t.scalar_type() == kDouble) {
+      t = t.to(kFloat);
+    } else if (t.scalar_type() == kComplexDouble) {
+      t = t.to(kComplexFloat);
+    }
+  };
+
+  Tensor input = iter.input(0);
+  Tensor other = iter.input(1);
+  Tensor out = iter.output();
+
+  convert_double_scalar(input);
+  convert_double_scalar(other);
+
+  MPSStream* mpsStream = getCurrentMPSStream();
+  const auto cast_needed = input.scalar_type() != other.scalar_type();
+  const auto suffix = iter.is_contiguous() ? "dense" : "strided";
+  // TODO: Implicitly pass both input and output types to non-cast kernels
+  const auto kernel_name = cast_needed
+      ? fmt::format("{}_{}_cast_{}_{}", name, suffix, scalarToMetalTypeString(out), params_type_name)
+      : fmt::format("{}_{}_{}_{}_{}",
+                    name,
+                    suffix,
+                    scalarToMetalTypeString(out),
+                    scalarToMetalTypeString(input),
+                    params_type_name);
+  dispatch_sync_with_rethrow(mpsStream->queue(), ^() {
+    @autoreleasepool {
+      auto computeEncoder = mpsStream->commandEncoder();
+      auto binaryPSO = getPipelineStateForFunc(kernel_name);
+      // this function call is a no-op if MPS Profiler is not enabled
+      getMPSProfiler().beginProfileKernel(binaryPSO, kernel_name, {input, other});
+      [computeEncoder setComputePipelineState:binaryPSO];
+      // Set input and output tensors
+      bind_iter_tensors(computeEncoder, iter);
+      // Iterator is contiguous if all of its elements are dense in storage,
+      // i.e. it's true for both row-first and column-first tensors
+      if (iter.is_contiguous()) {
+        detail::mtl_setArg(computeEncoder, params, 3);
+        if (cast_needed) {
+          std::array<int, 4> size_and_types = {static_cast<int>(c10::elementSize(input.scalar_type())),
+                                               static_cast<int>(c10::elementSize(other.scalar_type())),
+                                               static_cast<int>(input.scalar_type()),
+                                               static_cast<int>(other.scalar_type())};
+          mtl_setBytes(computeEncoder, size_and_types, 4);
+        }
+      } else {
+        // Please note that shapes and strides of the iterator might be
+        // different than that of its operands, for example binary op
+        // between 4x4 tensor and scalar will result in 1D 16 element iterator
+        std::array<int, 4> ndim_and_types = {iter.ndim(),
+                                             static_cast<int>(input.scalar_type()),
+                                             static_cast<int>(other.scalar_type()),
+                                             static_cast<int>(out.scalar_type())};
+        mtl_setArgs<3>(
+            computeEncoder, params, iter.shape(), iter.strides(0), iter.strides(1), iter.strides(2), ndim_and_types);
+      }
+      mtl_dispatch1DJob(computeEncoder, binaryPSO, iter.numel());
+      getMPSProfiler().endProfileKernel(binaryPSO);
+    }
+  });
+}
+
 } // namespace at::native::mps
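Both helpers open with the same recursion: if the iterator cannot be addressed with 32-bit offsets, split it into 32-bit-addressable sub-iterators and re-enter; the base case launches the kernel once per piece. A self-contained sketch of that control flow, assuming a toy Range type in place of TensorIteratorBase and a hypothetical splitter in place of with_32bit_indexing():

```cpp
// Sketch of the 32-bit-indexing decomposition pattern used above; Range and
// with_32bit_indexing() here are illustrative stand-ins, not the ATen API.
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

struct Range { int64_t begin, end; };

std::vector<Range> with_32bit_indexing(Range r) {  // hypothetical splitter
  std::vector<Range> parts;
  for (int64_t b = r.begin; b < r.end; b += INT32_MAX)
    parts.push_back({b, std::min<int64_t>(b + INT32_MAX, r.end)});
  return parts;
}

void exec(Range r) {
  if (r.end - r.begin > INT32_MAX) {   // cannot use 32-bit indexing
    for (auto& sub : with_32bit_indexing(r))
      exec(sub);                       // recurse; each piece hits the base case
    return;
  }
  std::printf("launch kernel over [%lld, %lld)\n",
              (long long)r.begin, (long long)r.end);
}

int main() { exec({0, 5'000'000'000LL}); }  // splits into three 32-bit launches
```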
16 aten/src/ATen/native/mps/kernels/Activation.h Normal file
@@ -0,0 +1,16 @@
+#pragma once
+
+template <typename T>
+struct ELUParams {
+  T alpha;
+  T scale;
+  T input_scale;
+};
+
+template <typename T>
+struct ELUBackwardParams {
+  T alpha;
+  T scale;
+  T input_scale;
+  bool is_result;
+};
@@ -1,3 +1,4 @@
+#include <ATen/native/mps/kernels/Activation.h>
 #include <c10/metal/indexing.h>
 #include <c10/metal/special_math.h>
 #include <metal_stdlib>
@@ -99,6 +100,59 @@ REGISTER_BINARY_OP(hardswish_backward, float, float);
 REGISTER_BINARY_OP(hardswish_backward, half, half);
 REGISTER_BINARY_OP(hardswish_backward, bfloat, bfloat);

+struct elu_functor {
+  template <typename T>
+  inline T operator()(const T self_, const ELUParams<T> params) {
+    using op_T = opmath_t<T>;
+    auto alpha = static_cast<op_T>(params.alpha);
+    auto scale = static_cast<op_T>(params.scale);
+    auto input_scale = static_cast<op_T>(params.input_scale);
+    auto self = static_cast<op_T>(self_);
+    auto neg_res = alpha * (::metal::precise::exp(self * input_scale) - 1);
+    return static_cast<T>(scale * (self < 0 ? neg_res : self));
+  }
+};
+
+struct elu_backward_functor {
+  template <typename T>
+  inline T operator()(
+      const T grad_output_,
+      const T self_,
+      ELUBackwardParams<T> params) {
+    using op_T = opmath_t<T>;
+    auto alpha = static_cast<op_T>(params.alpha);
+    auto scale = static_cast<op_T>(params.scale);
+    auto input_scale = static_cast<op_T>(params.input_scale);
+    auto grad_output = static_cast<op_T>(grad_output_);
+    auto self = static_cast<op_T>(self_);
+
+    if (params.is_result) {
+      auto neg_coef = input_scale * (self + alpha * scale);
+      return static_cast<T>(grad_output * (self <= 0 ? neg_coef : scale));
+    } else {
+      auto neg_coef = input_scale * alpha * scale *
+          ::metal::precise::exp(self * input_scale);
+      return static_cast<T>(grad_output * (self <= 0 ? neg_coef : scale));
+    }
+  }
+};
+
+#define REGISTER_ELU_OP(T)            \
+  typedef ELUParams<T> ELUParams_##T; \
+  REGISTER_UNARY_ALPHA_OP(elu, T, ELUParams_##T, T);
+
+REGISTER_ELU_OP(float);
+REGISTER_ELU_OP(half);
+REGISTER_ELU_OP(bfloat);
+
+#define REGISTER_ELU_BACKWARD_OP(T)                   \
+  typedef ELUBackwardParams<T> ELUBackwardParams_##T; \
+  REGISTER_BINARY_ALPHA_OP(elu_backward, T, ELUBackwardParams_##T, T);
+
+REGISTER_ELU_BACKWARD_OP(float);
+REGISTER_ELU_BACKWARD_OP(half);
+REGISTER_ELU_BACKWARD_OP(bfloat);
+
 struct leaky_relu_functor {
   template <typename T>
   inline T operator()(const T x, const T negative_slope) {
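The functor receives all three ELU coefficients through a single ELUParams struct and evaluates in opmath_t precision before casting back. A host-side C++ rendering of the same math, assuming std::exp in place of ::metal::precise::exp and no opmath widening:

```cpp
// Host-side sketch of the elu_functor math above; ELUParams mirrors the
// struct in Activation.h, std::exp stands in for ::metal::precise::exp.
#include <cmath>
#include <cstdio>

template <typename T>
struct ELUParams { T alpha, scale, input_scale; };

template <typename T>
T elu(T self, ELUParams<T> p) {
  T neg_res = p.alpha * (std::exp(self * p.input_scale) - T(1));
  return p.scale * (self < T(0) ? neg_res : self);  // negative branch uses exp
}

int main() {
  ELUParams<float> p{1.0f, 1.0f, 1.0f};                // default ELU coefficients
  std::printf("%f %f\n", elu(-1.0f, p), elu(2.0f, p)); // -0.632121 2.000000
}
```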
@@ -11,8 +11,6 @@
 #include <ATen/ops/_log_softmax_native.h>
 #include <ATen/ops/_prelu_kernel_backward_native.h>
 #include <ATen/ops/_prelu_kernel_native.h>
-#include <ATen/ops/elu_backward_native.h>
-#include <ATen/ops/elu_native.h>
 #include <ATen/ops/gelu_backward_native.h>
 #include <ATen/ops/gelu_native.h>
 #include <ATen/ops/glu_backward_native.h>
@@ -698,194 +696,6 @@ TORCH_IMPL_FUNC(gelu_backward_out_mps)
   }
 }

-static void elu_variants_out_mps(const Tensor& self,
-                                 const Scalar& alpha,
-                                 const Scalar& scale,
-                                 const Scalar& input_scale,
-                                 const Tensor& result,
-                                 std::string func_name) {
-  using namespace mps;
-  using CachedGraph = MPSUnaryCachedGraph;
-
-  auto resultMemFormat = result.suggest_memory_format();
-  bool executeGatherOp = !(self.is_contiguous(resultMemFormat) && result.is_contiguous(resultMemFormat));
-  Tensor out;
-  if (executeGatherOp) {
-    out = at::empty_like(result, MemoryFormat::Contiguous);
-  }
-
-  // Empty output
-  if (result.numel() == 0) {
-    return;
-  }
-
-  MPSStream* stream = getCurrentMPSStream();
-
-  @autoreleasepool {
-    std::string key = func_name + ":" + getTensorsStringKey({self}) + ":" + std::to_string(alpha.to<double>()) + ":" +
-        std::to_string(scale.to<double>()) + ":" + std::to_string(input_scale.to<double>());
-
-    auto cachedGraph = LookUpOrCreateCachedGraph<CachedGraph>(key, [&](auto mpsGraph, auto newCachedGraph) {
-      MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, self);
-
-      // scale * (max(0, x) + min(0, alpha * (exp(input_scale * x) - 1) ))
-
-      MPSGraphTensor* alphaTensor = [mpsGraph constantWithScalar:alpha.to<double>()
-                                                           shape:@[ @1 ]
-                                                        dataType:getMPSDataType(self)];
-
-      MPSGraphTensor* inputScaleTensor = [mpsGraph constantWithScalar:input_scale.to<double>()
-                                                                shape:@[ @1 ]
-                                                             dataType:getMPSDataType(self)];
-
-      MPSGraphTensor* scaleTensor = [mpsGraph constantWithScalar:scale.to<double>()
-                                                           shape:@[ @1 ]
-                                                        dataType:getMPSDataType(self)];
-      MPSGraphTensor* unitTensor = [mpsGraph constantWithScalar:1.0f shape:@[ @1 ] dataType:getMPSDataType(self)];
-      MPSGraphTensor* zeroTensor = [mpsGraph constantWithScalar:0.0f shape:@[ @1 ] dataType:getMPSDataType(self)];
-
-      MPSGraphTensor* scaledInputTensor = [mpsGraph multiplicationWithPrimaryTensor:inputTensor
-                                                                    secondaryTensor:inputScaleTensor
-                                                                               name:nil];
-      MPSGraphTensor* exponentTensor = [mpsGraph exponentWithTensor:scaledInputTensor name:nil];
-      MPSGraphTensor* exponentMinusOneTensor = [mpsGraph subtractionWithPrimaryTensor:exponentTensor
-                                                                      secondaryTensor:unitTensor
-                                                                                 name:nil];
-      MPSGraphTensor* alphaTimesTensor = [mpsGraph multiplicationWithPrimaryTensor:exponentMinusOneTensor
-                                                                   secondaryTensor:alphaTensor
-                                                                              name:nil];
-      MPSGraphTensor* predicateTensor = [mpsGraph greaterThanWithPrimaryTensor:inputTensor
-                                                               secondaryTensor:zeroTensor
-                                                                          name:nil];
-      MPSGraphTensor* fusedOutput = [mpsGraph selectWithPredicateTensor:predicateTensor
-                                                    truePredicateTensor:inputTensor
-                                                   falsePredicateTensor:alphaTimesTensor
-                                                                   name:nil];
-      MPSGraphTensor* outputTensor = [mpsGraph multiplicationWithPrimaryTensor:fusedOutput
-                                                               secondaryTensor:scaleTensor
-                                                                          name:nil];
-
-      newCachedGraph->inputTensor_ = inputTensor;
-      newCachedGraph->outputTensor_ = outputTensor;
-    });
-
-    auto selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self, nil, executeGatherOp);
-    auto outputPlaceholder = Placeholder(cachedGraph->outputTensor_, out.has_storage() ? out : result, nil, false);
-    auto feeds = dictionaryFromPlaceholders(selfPlaceholder);
-    runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder);
-    if (out.has_storage()) {
-      result.copy_(out);
-    }
-  }
-}
-
-// scale * (max(0, x) + min(0, alpha * (exp(input_scale * x) - 1) ))
-TORCH_IMPL_FUNC(elu_out_mps)
-(const Tensor& self, const Scalar& alpha, const Scalar& scale, const Scalar& input_scale, const Tensor& result) {
-  elu_variants_out_mps(self, alpha, scale, input_scale, result, "elu_out_mps");
-}
-
-TORCH_IMPL_FUNC(elu_backward_out_mps)
-(const Tensor& grad_output,
- const Scalar& alpha,
- const Scalar& scale,
- const Scalar& input_scale,
- bool is_result,
- const Tensor& self_or_result,
- const Tensor& grad_input) {
-  using namespace mps;
-  using CachedGraph = MPSUnaryGradCachedGraph;
-  auto gradMemFormat = grad_input.suggest_memory_format();
-  bool executeGatherOp = !(grad_output.is_contiguous(gradMemFormat) && self_or_result.is_contiguous(gradMemFormat) &&
-                           grad_input.is_contiguous(gradMemFormat));
-  Tensor out;
-  if (executeGatherOp && gradMemFormat == MemoryFormat::ChannelsLast) {
-    out = at::empty_like(grad_input, MemoryFormat::Contiguous);
-  }
-
-  // Empty output
-  if (grad_input.numel() == 0) {
-    return;
-  }
-
-  MPSStream* stream = getCurrentMPSStream();
-
-  @autoreleasepool {
-    std::string key = "elu_backward_out_mps:" + getTensorsStringKey({grad_output, self_or_result}) + ":" +
-        std::to_string(alpha.to<double>()) + ":" + std::to_string(scale.to<double>()) + ":" +
-        std::to_string(input_scale.to<double>()) + ":" + std::to_string(is_result);
-
-    auto cachedGraph = LookUpOrCreateCachedGraph<CachedGraph>(key, [&](auto mpsGraph, auto newCachedGraph) {
-      MPSGraphTensor* gradOutputTensor = mpsGraphRankedPlaceHolder(mpsGraph, grad_output);
-      MPSGraphTensor* selfOrResultTensor = mpsGraphRankedPlaceHolder(mpsGraph, self_or_result);
-      MPSGraphTensor* lessThanZeroGradTensor = nil;
-
-      if (is_result) {
-        MPSGraphTensor* alphaTensor = [mpsGraph constantWithScalar:alpha.to<double>()
-                                                             shape:@[ @1 ]
-                                                          dataType:getMPSDataType(grad_output)];
-        MPSGraphTensor* resultPlusAlphaTensor = [mpsGraph additionWithPrimaryTensor:selfOrResultTensor
-                                                                    secondaryTensor:alphaTensor
-                                                                               name:nil];
-        auto constMul = scale.to<double>() * input_scale.to<double>();
-        MPSGraphTensor* constMulTensor = [mpsGraph constantWithScalar:constMul
-                                                                shape:@[ @1 ]
-                                                             dataType:getMPSDataType(grad_output)];
-        lessThanZeroGradTensor = [mpsGraph multiplicationWithPrimaryTensor:resultPlusAlphaTensor
-                                                           secondaryTensor:constMulTensor
-                                                                      name:nil];
-      } else {
-        MPSGraphTensor* inputScaleTensor = [mpsGraph constantWithScalar:input_scale.to<double>()
-                                                                  shape:@[ @1 ]
-                                                               dataType:getMPSDataType(grad_output)];
-        MPSGraphTensor* scaledInputTensor = [mpsGraph multiplicationWithPrimaryTensor:selfOrResultTensor
-                                                                      secondaryTensor:inputScaleTensor
-                                                                                 name:nil];
-        MPSGraphTensor* expTensor = [mpsGraph exponentWithTensor:scaledInputTensor name:nil];
-        auto constMul = scale.to<double>() * input_scale.to<double>() * alpha.to<double>();
-        MPSGraphTensor* constMulTensor = [mpsGraph constantWithScalar:constMul
-                                                                shape:@[ @1 ]
-                                                             dataType:getMPSDataType(grad_output)];
-        lessThanZeroGradTensor = [mpsGraph multiplicationWithPrimaryTensor:expTensor
-                                                           secondaryTensor:constMulTensor
-                                                                      name:nil];
-      }
-
-      MPSGraphTensor* scaleTensor = [mpsGraph constantWithScalar:scale.to<double>()
-                                                           shape:@[ @1 ]
-                                                        dataType:getMPSDataType(grad_output)];
-      MPSGraphTensor* zeroTensor = [mpsGraph constantWithScalar:0.0f
-                                                          shape:@[ @1 ]
-                                                       dataType:getMPSDataType(grad_output)];
-      MPSGraphTensor* predicateTensor = [mpsGraph greaterThanWithPrimaryTensor:selfOrResultTensor
-                                                               secondaryTensor:zeroTensor
-                                                                          name:nil];
-      MPSGraphTensor* gradTensor = [mpsGraph selectWithPredicateTensor:predicateTensor
-                                                   truePredicateTensor:scaleTensor
-                                                  falsePredicateTensor:lessThanZeroGradTensor
-                                                                  name:nil];
-      MPSGraphTensor* gradInputTensor = [mpsGraph multiplicationWithPrimaryTensor:gradTensor
-                                                                  secondaryTensor:gradOutputTensor
-                                                                             name:nil];
-
-      newCachedGraph->gradOutputTensor_ = gradOutputTensor;
-      newCachedGraph->inputTensor_ = selfOrResultTensor;
-      newCachedGraph->gradInputTensor_ = gradInputTensor;
-    });
-
-    Placeholder gradOutputPlaceholder = Placeholder(cachedGraph->gradOutputTensor_, grad_output, nil, executeGatherOp);
-    Placeholder selfOrResultPlaceholder = Placeholder(cachedGraph->inputTensor_, self_or_result, nil, executeGatherOp);
-    Placeholder gradInputPlaceholder =
-        Placeholder(cachedGraph->gradInputTensor_, out.has_storage() ? out : grad_input, nil, false);
-
-    auto feeds = dictionaryFromPlaceholders(gradOutputPlaceholder, selfOrResultPlaceholder);
-    runMPSGraph(stream, cachedGraph->graph(), feeds, gradInputPlaceholder);
-    if (out.has_storage()) {
-      grad_input.copy_(out);
-    }
-  }
-}
-
 TORCH_IMPL_FUNC(glu_out_mps)(const Tensor& self, const int64_t dim, const Tensor& output) {
   using namespace mps;
   using CachedGraph = MPSUnaryCachedGraph;
@@ -1,8 +1,10 @@
 #define TORCH_ASSERT_ONLY_METHOD_OPERATORS
 #include <ATen/Dispatch.h>
 #include <ATen/TensorIterator.h>
 #include <ATen/mps/MPSProfiler.h>
 #include <ATen/native/Activation.h>
 #include <ATen/native/mps/OperationUtils.h>
+#include <ATen/native/mps/kernels/Activation.h>
+#include <fmt/format.h>

 namespace at::native {
@@ -41,6 +43,30 @@ static void hardswish_backward_kernel(at::TensorIterator& iter) {
   lib.exec_binary_kernel(iter, "hardswish_backward");
 }

+static void elu_kernel(TensorIteratorBase& iter, const Scalar& alpha, const Scalar& scale, const Scalar& input_scale) {
+  AT_DISPATCH_FLOATING_TYPES_AND2(c10::kHalf, c10::kBFloat16, iter.common_dtype(), "elu_mps", [&]() {
+    ELUParams<scalar_t> params{alpha.to<scalar_t>(), scale.to<scalar_t>(), input_scale.to<scalar_t>()};
+    lib.exec_unary_kernel_with_params(
+        iter, "elu", params, fmt::format("ELUParams_{}", mps::scalarToMetalTypeString(iter.common_dtype())));
+  });
+}
+
+static void elu_backward_kernel(TensorIteratorBase& iter,
+                                const Scalar& alpha,
+                                const Scalar& scale,
+                                const Scalar& input_scale,
+                                bool is_result) {
+  AT_DISPATCH_FLOATING_TYPES_AND2(c10::kHalf, c10::kBFloat16, iter.common_dtype(), "elu_backward_mps", [&]() {
+    ELUBackwardParams<scalar_t> params{
+        alpha.to<scalar_t>(), scale.to<scalar_t>(), input_scale.to<scalar_t>(), is_result};
+    lib.exec_binary_kernel_with_params(
+        iter,
+        "elu_backward",
+        params,
+        fmt::format("ELUBackwardParams_{}", mps::scalarToMetalTypeString(iter.common_dtype())));
+  });
+}
+
 static void leaky_relu_kernel(TensorIteratorBase& iter, const Scalar& negative_slope) {
   lib.exec_unary_kernel(iter, "leaky_relu", negative_slope);
 }
@@ -56,6 +82,8 @@ REGISTER_DISPATCH(hardsigmoid_stub, hardsigmoid_kernel);
 REGISTER_DISPATCH(hardsigmoid_backward_stub, hardsigmoid_backward_kernel);
 REGISTER_DISPATCH(hardswish_stub, hardswish_kernel);
 REGISTER_DISPATCH(hardswish_backward_stub, hardswish_backward_kernel);
+REGISTER_DISPATCH(elu_stub, elu_kernel);
+REGISTER_DISPATCH(elu_backward_stub, elu_backward_kernel);
 REGISTER_DISPATCH(leaky_relu_stub, leaky_relu_kernel);
 REGISTER_DISPATCH(leaky_relu_backward_stub, leaky_relu_backward_kernel);
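elu_kernel relies on AT_DISPATCH_FLOATING_TYPES_AND2 to turn the runtime dtype into a static scalar_t before building the params struct. Conceptually the macro is a switch over dtypes that instantiates its lambda once per type; a standalone sketch of that idea, assuming a hand-rolled enum and dispatcher rather than the ATen macro:

```cpp
// Standalone sketch of dtype dispatch; ScalarType and dispatch_floating are
// illustrative stand-ins for c10::ScalarType and AT_DISPATCH_FLOATING_TYPES_AND2.
#include <cstdio>

enum class ScalarType { Float, Double };

template <typename F>
void dispatch_floating(ScalarType t, F&& body) {
  switch (t) {
    case ScalarType::Float:  body(float{});  break;  // scalar_t = float
    case ScalarType::Double: body(double{}); break;  // scalar_t = double
  }
}

int main() {
  dispatch_floating(ScalarType::Float, [](auto tag) {
    using scalar_t = decltype(tag);  // the static type the dispatcher provides
    std::printf("sizeof(scalar_t) = %zu\n", sizeof(scalar_t));
  });
}
```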
@@ -12064,8 +12064,7 @@
   device_check: NoCheck # TensorIterator
   python_module: nn
   dispatch:
-    CPU, CUDA: elu_out
-    MPS: elu_out_mps
+    CPU, CUDA, MPS: elu_out

 - func: elu(Tensor self, Scalar alpha=1, Scalar scale=1, Scalar input_scale=1) -> Tensor
   structured_delegate: elu.out
@@ -12078,8 +12077,7 @@
   structured_inherits: TensorIteratorBase
   python_module: nn
   dispatch:
-    CPU, CUDA: elu_backward_out
-    MPS: elu_backward_out_mps
+    CPU, CUDA, MPS: elu_backward_out

 - func: elu_backward(Tensor grad_output, Scalar alpha, Scalar scale, Scalar input_scale, bool is_result, Tensor self_or_result) -> Tensor
   structured_delegate: elu_backward.grad_input
@@ -828,9 +828,6 @@ inductor_one_sample["cuda"] = {
     "nn.functional.fractional_max_pool3d": {f16, f32, f64},
     "nn.functional.group_norm": {f16},
     "nn.functional.hinge_embedding_loss": {f16},
-    # Enabling all tests for this test fails randomly
-    # See https://github.com/pytorch/pytorch/issues/129238
-    "nn.functional.huber_loss": {f16},
     "nn.functional.interpolate.bicubic": {f16},
     "nn.functional.interpolate.bilinear": {f16},
     "nn.functional.interpolate.trilinear": {f16},
@@ -948,9 +945,6 @@ inductor_one_sample["xpu"] = {
     "nn.functional.fractional_max_pool3d": {f16, f32, f64},
     "nn.functional.group_norm": {f16},
     "nn.functional.hinge_embedding_loss": {f16},
-    # Enabling all tests for this test fails randomly
-    # See https://github.com/pytorch/pytorch/issues/129238
-    "nn.functional.huber_loss": {f16},
     "nn.functional.interpolate.bicubic": {f16},
     "nn.functional.interpolate.bilinear": {f16},
     "nn.functional.interpolate.trilinear": {f16},
2 third_party/fbgemm vendored
Submodule third_party/fbgemm updated: c0b988d39a...e1fc43e34e
@@ -421,7 +421,7 @@ RESET_GRAD_ACCUMULATOR = {"set_", "resize_"}
 # inplace or out-variants)
 # If the function does not modify its arguments, we also check the following properties
 # pertaining to its output:
-# 2) Its TensorImpl has use_count of 1
+# 2) Its TensorImpl has use_count of 1 (or 2 if it has a PyObject)
 # 3) If the function is a view function, it has the same StorageImpl as that of
 #    the input it is aliased with. Otherwise, its StorageImpl has use_count of 1
 #
@@ -496,10 +496,10 @@ if (${tensor_name}_impl_saved && !at::impl::dispatch_mode_enabled() && !at::impl
 """
 )

-ENFORCE_TENSOR_IMPL_USE_COUNT_LT_OR_EQ_ONE = CodeTemplate(
+ENFORCE_TENSOR_IMPL_USE_COUNT = CodeTemplate(
     """\
 if (!at::impl::dispatch_mode_enabled() && !at::impl::tensor_has_dispatch(${tensor_name}))
-  TORCH_INTERNAL_ASSERT(${tensor_name}.use_count() <= 1, "function: ${fn_name}");
+  TORCH_INTERNAL_ASSERT(${tensor_name}.use_count() == expected_fresh_use_count(${tensor_name}), "function: ${fn_name}");
 """
 )

@@ -1664,7 +1664,7 @@ def emit_body(

     if type_wrapper_name(f) not in DONT_ENFORCE_TENSOR_IMPL_USE_COUNT:
         stmts_after_call += [
-            ENFORCE_TENSOR_IMPL_USE_COUNT_LT_OR_EQ_ONE.substitute(
+            ENFORCE_TENSOR_IMPL_USE_COUNT.substitute(
                 tensor_name=ret_name, fn_name=type_wrapper_name(f)
             )
         ]
@@ -47,6 +47,18 @@ namespace{
     meta->grad_accumulator_.reset();
   }
 }
+[[maybe_unused]] size_t expected_fresh_use_count(const Variable& self) {
+  if (!self.defined()) {
+    // An UndefinedTensorImpl always has a use count of 0
+    return 0;
+  }
+  if (self.unsafeGetTensorImpl()->pyobj_slot()->load_pyobj() != nullptr) {
+    // A TensorImpl with a Python object has a use count of 2
+    return 2;
+  }
+  // A fresh TensorImpl (with no PyObject) has a use count of 1
+  return 1;
+}
 }

 namespace {
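The helper encodes the invariant the generated assert now checks: a freshly returned tensor owns exactly one reference, unless a cached PyObject holds a second one. The same arithmetic, assuming std::shared_ptr as a stand-in for TensorImpl's intrusive refcount:

```cpp
// Sketch of the refcount invariant behind expected_fresh_use_count;
// std::shared_ptr stands in for TensorImpl's intrusive use_count.
#include <cstdio>
#include <memory>

int main() {
  auto fresh = std::make_shared<int>(42);
  std::printf("fresh use_count: %ld\n", fresh.use_count());  // 1
  auto pyobj_slot = fresh;  // a cached PyObject keeps a second owner alive
  std::printf("with pyobj: %ld\n", fresh.use_count());       // 2
}
```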
@@ -138,7 +138,7 @@ inline void PyErr_SetString(PyObject* type, const std::string& message) {
       throw;                                            \
     }                                                   \
   }                                                     \
-  catch (const std::exception& e) {                     \
+  catch (const std::exception&) {                       \
    torch::translate_exception_to_python(std::current_exception()); \
     return retval;                                      \
   }
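This hunk and the remaining ones apply a single mechanical cleanup: when a handler never reads the exception object, drop the named binding and catch by type alone, which silences unused-variable warnings without changing behavior; std::current_exception() still observes the in-flight exception. A minimal sketch of the pattern, assuming a trivial translate step in place of torch::translate_exception_to_python:

```cpp
// Sketch of the catch-by-type cleanup applied across the hunks below;
// translate() is a hypothetical stand-in for the real exception translator.
#include <cstdio>
#include <exception>
#include <stdexcept>

void translate(std::exception_ptr ep) {
  try { std::rethrow_exception(ep); }
  catch (const std::exception& inner) { std::printf("translated: %s\n", inner.what()); }
}

int main() {
  try {
    throw std::runtime_error("boom");
  } catch (const std::exception&) {       // no `e` binding: no unused warning
    translate(std::current_exception());  // the exception is still reachable
  }
}
```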
@@ -81,7 +81,7 @@ c10::intrusive_ptr<Backend> ProcessGroup::getBackend(
   ProcessGroup::BackendType backendType{ProcessGroup::BackendType::UNDEFINED};
   try {
     backendType = deviceTypeToBackendType_.at(deviceType);
-  } catch (const std::out_of_range& e) {
+  } catch (const std::out_of_range&) {
     TORCH_CHECK(
         false, "No backend type associated with device type ", deviceType);
   }
@@ -246,7 +246,7 @@ class UvTcpServer : public UvTcpSocket {
             uv_err_name(uv_res),
             uv_strerror(uv_res)));
     res->cacheSocketPort();
-  } catch (std::exception& ex) {
+  } catch (std::exception&) {
     res->close();
     throw;
   }
@@ -322,7 +322,7 @@ class UvTcpServer : public UvTcpSocket {
             uv_err_name(uv_res),
             uv_strerror(uv_res)));
     res->cacheSocketPort();
-  } catch (std::exception& ex) {
+  } catch (std::exception&) {
     res->close();
     throw;
   }
@@ -353,7 +353,7 @@ static PyObject* NodeBase__update_args_kwargs(
     Py_CLEAR(node->_kwargs);
     node->_kwargs = map_aggregate(args[1], visit_fn);
     Py_RETURN_NONE;
-  } catch (const PythonError& e) {
+  } catch (const PythonError&) {
     return nullptr;
   }
 }
@@ -397,7 +397,7 @@ static PyObject* NodeBase__replace_input_with(

     PyObject* update_args[2] = {new_args.get(), new_kwargs.get()};
     return NodeBase__update_args_kwargs(self, update_args, 2);
-  } catch (const PythonError& e) {
+  } catch (const PythonError&) {
     return nullptr;
   }
 }
@@ -802,7 +802,7 @@ static PyObject* py_map_aggregate(
     // args[0]: aggregate, args[1]: callable fn
     return map_aggregate(
         args[0], [fn](PyObject* a) { return PyObject_CallOneArg(fn, a); });
-  } catch (const PythonError& e) {
+  } catch (const PythonError&) {
     return nullptr; // error should already be set
   }
 }
@@ -824,7 +824,7 @@ static PyObject* py_map_arg(
       }
       return Py_NewRef(a);
     });
-  } catch (const PythonError& e) {
+  } catch (const PythonError&) {
     return nullptr; // error should already be set
   }
 }
@@ -117,7 +117,7 @@ struct type_caster<torch::jit::IValue> {
     try {
       value = torch::jit::toTypeInferredIValue(src);
       return true;
-    } catch (std::exception& e) {
+    } catch (std::exception&) {
      return false;
     }
   }
@@ -142,7 +142,7 @@ struct type_caster<torch::jit::Symbol> {
     std::string src_str;
     try {
       src_str = py::cast<std::string>(src);
-    } catch (std::exception& e) {
+    } catch (std::exception&) {
       return false;
     }
     value = torch::jit::Symbol::fromQualString(src_str);
@@ -285,7 +285,7 @@ struct FromImpl<torch::headeronly::HeaderOnlyArrayRef<T>> {
           torch_list_push_back(new_list_handle, from(elem)));
     }
     return from(new_list_handle);
-  } catch (const std::runtime_error& e) {
+  } catch (const std::runtime_error&) {
     if (new_list_handle != nullptr) {
       // clean up memory if an error was thrown
       TORCH_ERROR_CODE_CHECK(torch_delete_list(new_list_handle));
@@ -553,7 +553,7 @@ struct ToImpl<std::vector<T>> {
     }
     TORCH_ERROR_CODE_CHECK(torch_delete_list(list_handle));
     return result;
-  } catch (const std::runtime_error& e) {
+  } catch (const std::runtime_error&) {
     // clean up memory if an exception is thrown, and rethrow
     TORCH_ERROR_CODE_CHECK(torch_delete_list(list_handle));
     throw;