Compare commits

...

7 Commits

Author SHA1 Message Date
989cf62265 Automated submodule update: FBGEMM 2025-11-18 14:43:11 -08:00
dc4f3c7505 [MPS] Move elu impl to Metal (#166903)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166903
Approved by: https://github.com/malfet
2025-11-18 22:32:00 +00:00
e8970ba010 [CI] Migrate all gcc9 jobs to gcc11 (#167933)
As gcc-9 has not been supported for the last 3 years, and all manylinux2_28 builds should have at least gcc-11

Prep change for C++20 standard migration
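
For context, a minimal C++20 probe (illustrative only, not part of this change) that fails under gcc-9 but builds with gcc-11, since the `<concepts>` header only landed in libstdc++ with GCC 10:
```
// probe.cpp -- builds with `g++-11 -std=c++20 probe.cpp`,
// but fails under gcc-9, which predates <concepts> support.
#include <concepts>

template <std::integral T>
T twice(T x) {
  return x + x;
}

int main() {
  return twice(0);
}
```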
Pull Request resolved: https://github.com/pytorch/pytorch/pull/167933
Approved by: https://github.com/yangw-dev, https://github.com/atalman
ghstack dependencies: #168090
2025-11-18 22:04:53 +00:00
41999a579d Fix Tensor use_count check in VariableType.cpp (#168060)
Summary: If the Tensor has a PyObject, its use count will now be two instead of one.
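
As a minimal sketch of the invariant (assuming an ATen C++ build; not code from this diff): a freshly created Tensor starts with a TensorImpl use count of 1, and any second owner, such as a live PyObject wrapper, brings it to 2.
```
#include <ATen/ATen.h>
#include <iostream>

int main() {
  at::Tensor t = at::ones({2, 2});
  std::cout << t.use_count() << "\n";  // 1: the only owner of the TensorImpl
  at::Tensor alias = t;                // a second owner, analogous to a live PyObject
  std::cout << t.use_count() << "\n";  // 2
  return 0;
}
```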

Test Plan: `buck test -j 18 fbcode//mode/dev-nosan fbcode//caffe2/test:torch`

Differential Revision: D87297965

Pull Request resolved: https://github.com/pytorch/pytorch/pull/168060
Approved by: https://github.com/albanD, https://github.com/Skylion007
2025-11-18 22:02:02 +00:00
ebb2001a48 [codemod][lowrisk] Remove unused exception parameter from caffe2/torch/csrc/Exceptions.h (#168056)
Summary:
`-Wunused-exception-parameter` has identified an unused exception parameter. This diff removes it.

This:
```
try {
    ...
} catch (exception& e) {
    // no use of e
}
```
should instead be written as
```
} catch (exception&) {
```

If the code compiles, this is safe to land.
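
A self-contained illustration of the pattern (hypothetical example, not from this diff); the anonymous catch still filters by type, so behavior is unchanged:
```
#include <iostream>
#include <stdexcept>

int main() {
  try {
    throw std::runtime_error("boom");
  } catch (const std::exception&) {  // type still matched; no unused `e` to warn about
    std::cout << "caught\n";
  }
  return 0;
}
```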

Test Plan: Sandcastle

Reviewed By: dtolnay

Differential Revision: D87273132

Pull Request resolved: https://github.com/pytorch/pytorch/pull/168056
Approved by: https://github.com/malfet, https://github.com/Skylion007
2025-11-18 20:21:48 +00:00
ae85307512 huber_loss numerical issue (#166952)
For GPU: It was previously reported that only a single sample could be tested with the huber_loss functional. The current snapshot of the code does not appear to suffer from the numerical issues reported before.

For CPU: While testing the GPU path, it was discovered that the CPU implementation appears to be numerically unstable with Half. This commit resolves the issue by upcasting Half to float for the computation.
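
A minimal sketch of the upcast pattern (a hypothetical standalone helper mirroring the fix, assuming ATen's c10::Half is available): the Huber math runs in float and rounds back to half exactly once.
```
#include <c10/util/Half.h>
#include <cmath>
#include <iostream>

c10::Half huber_half(c10::Half a, c10::Half b, float delta) {
  // Upcast inputs once, compute in float, downcast only the final result.
  float z = std::abs(static_cast<float>(a) - static_cast<float>(b));
  float out = z < delta ? 0.5f * z * z : delta * (z - 0.5f * delta);
  return static_cast<c10::Half>(out);
}

int main() {
  // |1.5 - 1.0| = 0.5 < delta, so the quadratic branch gives 0.5 * 0.5^2 = 0.125
  std::cout << static_cast<float>(huber_half(c10::Half(1.5f), c10::Half(1.0f), 1.0f)) << "\n";
  return 0;
}
```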

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166952
Approved by: https://github.com/benjaminglass1, https://github.com/isuruf
2025-11-18 20:06:29 +00:00
7921c0eb0e [ROCm][CI] Limit caching to ROCm jammy docker images (#168088)
Since the currently intended workflow on the new MI3xx CI capacity is [trunk-rocm-mi300.yml](d91269e8ce/.github/workflows/trunk-rocm-mi300.yml (L54)), which only needs the jammy images, caching is limited to those images to optimize docker caching times.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/168088
Approved by: https://github.com/jeffdaily
2025-11-18 20:04:20 +00:00
35 changed files with 416 additions and 343 deletions

View File

@ -125,10 +125,10 @@ case "$tag" in
UCC_COMMIT=${_UCC_COMMIT}
TRITON=yes
;;
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks)
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11-inductor-benchmarks)
CUDA_VERSION=12.8.1
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=9
GCC_VERSION=11
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
@ -146,16 +146,6 @@ case "$tag" in
UCC_COMMIT=${_UCC_COMMIT}
TRITON=yes
;;
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9)
CUDA_VERSION=12.8.1
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=9
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
TRITON=yes
;;
pytorch-linux-jammy-py3-clang12-onnx)
ANACONDA_PYTHON_VERSION=3.10
CLANG_VERSION=12

View File

@ -23,7 +23,7 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
with:
runner: linux.12xlarge.memory
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm80
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
cuda-arch-list: '8.0 9.0'
test-matrix: |
@ -39,7 +39,7 @@ jobs:
needs: attn-microbenchmark-build
with:
timeout-minutes: 500
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm80
docker-image: ${{ needs.attn-microbenchmark-build.outputs.docker-image }}
test-matrix: ${{ needs.attn-microbenchmark-build.outputs.test-matrix }}
secrets: inherit
@ -51,7 +51,7 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
with:
runner: linux.12xlarge.memory
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
cuda-arch-list: '10.0'
test-matrix: |
@ -66,7 +66,7 @@ jobs:
needs: opmicrobenchmark-build-b200
with:
timeout-minutes: 500
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100
docker-image: ${{ needs.opmicrobenchmark-build-b200.outputs.docker-image }}
test-matrix: ${{ needs.opmicrobenchmark-build-b200.outputs.test-matrix }}
aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only

View File

@ -52,8 +52,7 @@ jobs:
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11,
pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11,
pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc11-vllm,
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks,
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9,
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11-inductor-benchmarks,
pytorch-linux-jammy-cuda12.4-cudnn9-py3-gcc11,
pytorch-linux-jammy-py3.10-clang12,
pytorch-linux-jammy-py3.11-clang12,

View File

@ -50,9 +50,10 @@ jobs:
matrix:
runner: [linux.rocm.gfx942.docker-cache]
docker-image: [
"${{ needs.download-docker-builds-artifacts.outputs.pytorch-linux-jammy-rocm-n-py3 }}",
"${{ needs.download-docker-builds-artifacts.outputs.pytorch-linux-noble-rocm-n-py3 }}",
"${{ needs.download-docker-builds-artifacts.outputs.pytorch-linux-jammy-rocm-n-py3-benchmarks }}"
"${{ needs.download-docker-builds-artifacts.outputs.pytorch-linux-jammy-rocm-n-py3 }}"
#"${{ needs.download-docker-builds-artifacts.outputs.pytorch-linux-jammy-rocm-n-py3 }}",
#"${{ needs.download-docker-builds-artifacts.outputs.pytorch-linux-noble-rocm-n-py3 }}",
#"${{ needs.download-docker-builds-artifacts.outputs.pytorch-linux-jammy-rocm-n-py3-benchmarks }}"
]
runs-on: "${{ matrix.runner }}"
steps:

View File

@ -30,14 +30,14 @@ jobs:
opt_out_experiments: lf
build:
name: cuda12.8-py3.10-gcc9-sm80
name: cuda12.8-py3.10-gcc11-sm80
uses: ./.github/workflows/_linux-build.yml
needs:
- get-default-label-prefix
with:
runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm80
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11-inductor-benchmarks
cuda-arch-list: '8.0'
test-matrix: |
{ include: [
@ -46,11 +46,11 @@ jobs:
secrets: inherit
test:
name: cuda12.8-py3.10-gcc9-sm80
name: cuda12.8-py3.10-gcc11-sm80
uses: ./.github/workflows/_linux-test.yml
needs: build
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm80
docker-image: ${{ needs.build.outputs.docker-image }}
test-matrix: ${{ needs.build.outputs.test-matrix }}
timeout-minutes: 720

View File

@ -27,14 +27,14 @@ jobs:
opt_out_experiments: lf
build:
name: cuda12.8-py3.10-gcc9-sm80
name: cuda12.8-py3.10-gcc11-sm80
uses: ./.github/workflows/_linux-build.yml
needs:
- get-default-label-prefix
with:
runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm80
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11-inductor-benchmarks
cuda-arch-list: '8.0'
test-matrix: |
{ include: [
@ -47,11 +47,11 @@ jobs:
secrets: inherit
test:
name: cuda12.8-py3.10-gcc9-sm80
name: cuda12.8-py3.10-gcc11-sm80
uses: ./.github/workflows/_linux-test.yml
needs: build
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm80
docker-image: ${{ needs.build.outputs.docker-image }}
test-matrix: ${{ needs.build.outputs.test-matrix }}
# disable monitor in perf tests for more investigation

View File

@ -80,7 +80,7 @@ jobs:
opt_out_experiments: lf
build:
name: cuda12.8-py3.10-gcc9-sm100
name: cuda12.8-py3.10-gcc11-sm100
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
@ -90,8 +90,8 @@ jobs:
# from trunk. Also use a memory-intensive runner here because memory is
# usually the bottleneck
runner: linux.12xlarge.memory
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11-inductor-benchmarks
cuda-arch-list: '10.0'
test-matrix: |
{ include: [
@ -104,12 +104,12 @@ jobs:
secrets: inherit
test-periodically:
name: cuda12.8-py3.10-gcc9-sm100
name: cuda12.8-py3.10-gcc11-sm100
uses: ./.github/workflows/_linux-test.yml
needs: build
if: github.event.schedule == '0 7 * * 1-6'
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100
dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-cudagraphs_low_precision-true
docker-image: ${{ needs.build.outputs.docker-image }}
test-matrix: ${{ needs.build.outputs.test-matrix }}
@ -121,12 +121,12 @@ jobs:
secrets: inherit
test-weekly:
name: cuda12.8-py3.10-gcc9-sm100
name: cuda12.8-py3.10-gcc11-sm100
uses: ./.github/workflows/_linux-test.yml
needs: build
if: github.event.schedule == '0 7 * * 0'
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100
dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-maxautotune-true-freeze_autotune_cudagraphs-true-cudagraphs_low_precision-true
docker-image: ${{ needs.build.outputs.docker-image }}
test-matrix: ${{ needs.build.outputs.test-matrix }}
@ -138,11 +138,11 @@ jobs:
secrets: inherit
test:
name: cuda12.8-py3.10-gcc9-sm100
name: cuda12.8-py3.10-gcc11-sm100
uses: ./.github/workflows/_linux-test.yml
needs: build
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100
dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cudagraphs-${{ inputs.cudagraphs }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}-maxautotune-${{ inputs.maxautotune }}-freezing_cudagraphs-${{ inputs.freezing_cudagraphs }}-cudagraphs_low_precision-${{ inputs.cudagraphs }}
docker-image: ${{ needs.build.outputs.docker-image }}
test-matrix: ${{ needs.build.outputs.test-matrix }}

View File

@ -95,8 +95,8 @@ jobs:
# from trunk. Also use a memory-intensive runner here because memory is
# usually the bottleneck
runner: linux.12xlarge.memory
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm90
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm90
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11-inductor-benchmarks
cuda-arch-list: '9.0'
test-matrix: |
{ include: [
@ -132,7 +132,7 @@ jobs:
needs: build
if: github.event.schedule == '15 0 * * 1-6'
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm90
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm90
dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-cudagraphs_low_precision-true
docker-image: ${{ needs.build.outputs.docker-image }}
test-matrix: ${{ needs.build.outputs.test-matrix }}
@ -149,7 +149,7 @@ jobs:
needs: build
if: github.event.schedule == '0 7 * * 0'
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm90
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm90
dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-maxautotune-true-freeze_autotune_cudagraphs-true-cudagraphs_low_precision-true
docker-image: ${{ needs.build.outputs.docker-image }}
test-matrix: ${{ needs.build.outputs.test-matrix }}
@ -168,7 +168,7 @@ jobs:
# needs one round of benchmark
if: ${{ github.event_name == 'workflow_dispatch' || github.event_name == 'pull_request' }}
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm90
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm90
dashboard-tag: training-${{ inputs.training || 'true' }}-inference-${{ inputs.inference || 'true' }}-default-${{ inputs.default || 'true' }}-dynamic-${{ inputs.dynamic || 'true' }}-cudagraphs-${{ inputs.cudagraphs || 'true' }}-cppwrapper-${{ inputs.cppwrapper || 'false' }}-aotinductor-${{ inputs.aotinductor || 'false' }}-maxautotune-${{ inputs.maxautotune || 'false' }}-freezing_cudagraphs-${{ inputs.freezing_cudagraphs || 'false' }}-cudagraphs_low_precision-${{ inputs.cudagraphs || 'false' }}
docker-image: ${{ needs.build.outputs.docker-image }}
test-matrix: ${{ needs.build.outputs.test-matrix }}

View File

@ -80,15 +80,15 @@ jobs:
opt_out_experiments: lf
build:
name: cuda12.8-py3.10-gcc9-sm80
name: cuda12.8-py3.10-gcc11-sm80
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
# Every bit to make perf run faster helps
runner: linux.12xlarge.memory
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm80
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11-inductor-benchmarks
cuda-arch-list: '8.0'
test-matrix: |
{ include: [
@ -117,12 +117,12 @@ jobs:
secrets: inherit
test-nightly:
name: cuda12.8-py3.10-gcc9-sm80
name: cuda12.8-py3.10-gcc11-sm80
uses: ./.github/workflows/_linux-test.yml
needs: build
if: github.event.schedule == '0 7 * * 1-6'
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm80
dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-cudagraphs_low_precision-true
docker-image: ${{ needs.build.outputs.docker-image }}
test-matrix: ${{ needs.build.outputs.test-matrix }}
@ -133,12 +133,12 @@ jobs:
secrets: inherit
test-weekly:
name: cuda12.8-py3.10-gcc9-sm80
name: cuda12.8-py3.10-gcc11-sm80
uses: ./.github/workflows/_linux-test.yml
needs: build
if: github.event.schedule == '0 7 * * 0'
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm80
dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-maxautotune-true-freeze_autotune_cudagraphs-true-cudagraphs_low_precision-true
docker-image: ${{ needs.build.outputs.docker-image }}
test-matrix: ${{ needs.build.outputs.test-matrix }}
@ -150,12 +150,12 @@ jobs:
secrets: inherit
test:
name: cuda12.8-py3.10-gcc9-sm80
name: cuda12.8-py3.10-gcc11-sm80
uses: ./.github/workflows/_linux-test.yml
needs: build
if: github.event_name == 'workflow_dispatch'
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm80
dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cudagraphs-${{ inputs.cudagraphs }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}-maxautotune-${{ inputs.maxautotune }}-freezing_cudagraphs-${{ inputs.freezing_cudagraphs }}-cudagraphs_low_precision-${{ inputs.cudagraphs }}
docker-image: ${{ needs.build.outputs.docker-image }}
test-matrix: ${{ needs.build.outputs.test-matrix }}

View File

@ -37,8 +37,8 @@ jobs:
needs: get-default-label-prefix
with:
runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm86
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11-inductor-benchmarks
cuda-arch-list: '8.0;8.6'
test-matrix: |
{ include: [
@ -76,7 +76,7 @@ jobs:
uses: ./.github/workflows/_linux-test.yml
needs: periodic-dynamo-benchmarks-build
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm86
docker-image: ${{ needs.periodic-dynamo-benchmarks-build.outputs.docker-image }}
test-matrix: ${{ needs.periodic-dynamo-benchmarks-build.outputs.test-matrix }}
secrets: inherit
@ -138,8 +138,8 @@ jobs:
- get-default-label-prefix
with:
runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm80
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11-inductor-benchmarks
cuda-arch-list: '8.0'
test-matrix: |
{ include: [
@ -153,7 +153,7 @@ jobs:
uses: ./.github/workflows/_linux-test.yml
needs: inductor-smoke-build
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm80
docker-image: ${{ needs.inductor-smoke-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-smoke-build.outputs.test-matrix }}
secrets: inherit

View File

@ -33,8 +33,8 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm86
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11-inductor-benchmarks
cuda-arch-list: '8.6'
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
test-matrix: |
@ -52,7 +52,7 @@ jobs:
uses: ./.github/workflows/_linux-test.yml
needs: inductor-build
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm86
docker-image: ${{ needs.inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-build.outputs.test-matrix }}
secrets: inherit

View File

@ -49,8 +49,8 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm86
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11-inductor-benchmarks
cuda-arch-list: '8.6'
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
test-matrix: |
@ -69,7 +69,7 @@ jobs:
uses: ./.github/workflows/_linux-test.yml
needs: inductor-build
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm86
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm86
docker-image: ${{ needs.inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-build.outputs.test-matrix }}
secrets: inherit

View File

@ -25,7 +25,7 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
with:
runner: linux.12xlarge.memory
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm80
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
cuda-arch-list: '8.0 9.0'
test-matrix: |
@ -41,7 +41,7 @@ jobs:
needs: opmicrobenchmark-build
with:
timeout-minutes: 500
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm80
docker-image: ${{ needs.opmicrobenchmark-build.outputs.docker-image }}
test-matrix: ${{ needs.opmicrobenchmark-build.outputs.test-matrix }}
secrets: inherit
@ -53,7 +53,7 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
with:
runner: linux.12xlarge.memory
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
cuda-arch-list: '10.0'
test-matrix: |
@ -68,7 +68,7 @@ jobs:
needs: opmicrobenchmark-build-b200
with:
timeout-minutes: 500
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm100
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100
docker-image: ${{ needs.opmicrobenchmark-build-b200.outputs.docker-image }}
test-matrix: ${{ needs.opmicrobenchmark-build-b200.outputs.test-matrix }}
aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only

View File

@ -90,6 +90,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-cuda12.8-py3.10-gcc11
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
cuda-arch-list: 8.6
test-matrix: |
{ include: [
{ config: "nogpu_AVX512", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
@ -97,7 +98,9 @@ jobs:
{ config: "nogpu_AVX512", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
{ config: "nogpu_NO_AVX2", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
{ config: "nogpu_NO_AVX2", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
{ config: "jit_legacy", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
{ config: "jit_legacy", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
{ config: "multigpu", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.12xlarge.nvidia.gpu", owners: ["oncall:distributed"] },
{ config: "multigpu", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.12xlarge.nvidia.gpu", owners: ["oncall:distributed"] },
]}
secrets: inherit
@ -113,40 +116,14 @@ jobs:
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-cuda12_8-py3_10-gcc9-build:
name: linux-jammy-cuda12.8-py3.10-gcc9
linux-jammy-cuda12_8-py3_10-gcc11-debug-build:
name: linux-jammy-cuda12.8-py3.10-gcc11-debug
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-cuda12.8-py3.10-gcc9
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9
cuda-arch-list: 8.6
test-matrix: |
{ include: [
{ config: "multigpu", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.12xlarge.nvidia.gpu", owners: ["oncall:distributed"] },
{ config: "multigpu", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.12xlarge.nvidia.gpu", owners: ["oncall:distributed"] },
]}
secrets: inherit
linux-jammy-cuda12_8-py3_10-gcc9-test:
name: linux-jammy-cuda12.8-py3.10-gcc9
uses: ./.github/workflows/_linux-test.yml
needs: linux-jammy-cuda12_8-py3_10-gcc9-build
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc9
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-cuda12_8-py3_10-gcc9-debug-build:
name: linux-jammy-cuda12.8-py3.10-gcc9-debug
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-debug
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-debug
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
cuda-arch-list: 8.9
test-matrix: |
{ include: [
@ -160,16 +137,16 @@ jobs:
]}
secrets: inherit
linux-jammy-cuda12_8-py3_10-gcc9-debug-test:
name: linux-jammy-cuda12.8-py3.10-gcc9-debug
linux-jammy-cuda12_8-py3_10-gcc11-debug-test:
name: linux-jammy-cuda12.8-py3.10-gcc11-debug
uses: ./.github/workflows/_linux-test.yml
needs:
- linux-jammy-cuda12_8-py3_10-gcc9-debug-build
- linux-jammy-cuda12_8-py3_10-gcc11-debug-build
- target-determination
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-debug
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-debug-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-debug-build.outputs.test-matrix }}
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-debug
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-debug-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-debug-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-cuda13_0-py3_10-gcc11-build:

View File

@ -318,14 +318,14 @@ jobs:
]}
secrets: inherit
linux-jammy-cuda12_8-py3_10-gcc9-inductor-build:
name: cuda12.8-py3.10-gcc9-sm75
linux-jammy-cuda12_8-py3_10-gcc11-inductor-build:
name: cuda12.8-py3.10-gcc11-sm75
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm75
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm75
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11-inductor-benchmarks
cuda-arch-list: '7.5'
test-matrix: |
{ include: [
@ -333,14 +333,14 @@ jobs:
]}
secrets: inherit
linux-jammy-cuda12_8-py3_10-gcc9-inductor-test:
name: cuda12.8-py3.10-gcc9-sm75
linux-jammy-cuda12_8-py3_10-gcc11-inductor-test:
name: cuda12.8-py3.10-gcc11-sm75
uses: ./.github/workflows/_linux-test.yml
needs: linux-jammy-cuda12_8-py3_10-gcc9-inductor-build
needs: linux-jammy-cuda12_8-py3_10-gcc11-inductor-build
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm75
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.test-matrix }}
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm75
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-inductor-build.outputs.test-matrix }}
secrets: inherit
linux-noble-xpu-n-py3_10-build:

View File

@ -26,14 +26,14 @@ jobs:
curr_ref_type: ${{ github.ref_type }}
build:
name: cuda12.8-py3.10-gcc9-sm80
name: cuda12.8-py3.10-gcc11-sm80
uses: ./.github/workflows/_linux-build.yml
needs:
- get-default-label-prefix
with:
runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm80
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11-inductor-benchmarks
cuda-arch-list: '8.0'
test-matrix: |
{ include: [
@ -42,11 +42,11 @@ jobs:
secrets: inherit
test:
name: cuda12.8-py3.10-gcc9-sm80
name: cuda12.8-py3.10-gcc11-sm80
uses: ./.github/workflows/_linux-test.yml
needs: build
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm80
docker-image: ${{ needs.build.outputs.docker-image }}
test-matrix: ${{ needs.build.outputs.test-matrix }}
secrets: inherit

View File

@ -231,8 +231,8 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
build-environment: linux-jammy-cuda12.8-py3.12-gcc9-sm80
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks
build-environment: linux-jammy-cuda12.8-py3.12-gcc11-sm80
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11-inductor-benchmarks
cuda-arch-list: '8.0'
secrets: inherit

View File

@ -813,8 +813,43 @@ void smooth_l1_kernel(TensorIteratorBase& iter, double beta) {
}
void huber_kernel(TensorIterator& iter, double delta) {
AT_DISPATCH_FLOATING_TYPES_AND2(
kBFloat16, kHalf, iter.dtype(), "huber_cpu", [&]() {
// Special-case kHalf: compute in float for numerical stability
if (iter.dtype() == kHalf) {
const float delta_val(static_cast<float>(delta));
const Vectorized<float> delta_vec(static_cast<float>(delta));
const Vectorized<float> point_five_vec(static_cast<float>(0.5));
cpu_kernel_vec(
iter,
// scalar lambda: convert half -> float, compute in float, cast back to half
[&delta_val] (at::Half a, at::Half b) -> at::Half {
float af = static_cast<float>(a);
float bf = static_cast<float>(b);
float z = std::abs(af - bf);
float out = z < delta_val
? 0.5f * z * z
: delta_val * (z - 0.5f * delta_val);
return static_cast<at::Half>(out);
},
[&delta_vec, &point_five_vec] (Vectorized<Half> a, Vectorized<Half> b) {
auto [a0, a1] = convert_half_float(a);
auto [b0, b1] = convert_half_float(b);
auto z = (a0 - b0).abs();
a0 = Vectorized<float>::blendv(
point_five_vec * z * z,
delta_vec * (z - point_five_vec * delta_vec),
z >= delta_vec);
z = (a1 - b1).abs();
a1 = Vectorized<float>::blendv(
point_five_vec * z * z,
delta_vec * (z - point_five_vec * delta_vec),
z >= delta_vec);
return convert_float_half(a0, a1);
}
);
return;
}
else {
AT_DISPATCH_FLOATING_TYPES_AND(kBFloat16, iter.dtype(), "huber_cpu", [&]() {
using Vec = Vectorized<scalar_t>;
const scalar_t delta_val(delta);
const Vec delta_val_vec(delta_val);
@ -835,6 +870,7 @@ void huber_kernel(TensorIterator& iter, double delta) {
z >= delta_val_vec);
});
});
}
}
void sigmoid_backward_kernel(TensorIteratorBase& iter) {

View File

@ -147,6 +147,19 @@ class MetalShaderLibrary {
const std::optional<c10::Scalar> alpha = std::nullopt,
const std::optional<c10::ScalarType> scalar_arg_type = std::nullopt);
template <typename T>
void exec_unary_kernel_with_params(
TensorIteratorBase& iter,
const std::string& name,
T params,
const std::string& params_type_name);
template <typename T>
void exec_binary_kernel_with_params(
TensorIteratorBase& iter,
const std::string& name,
T params,
const std::string& params_type_name);
protected:
virtual MTLLibrary_t getLibrary();
virtual MTLLibrary_t getLibrary(

View File

@ -7,10 +7,12 @@
#include <ATen/Tensor.h>
#include <ATen/TensorIterator.h>
#include <ATen/Utils.h>
#include <ATen/mps/MPSProfiler.h>
#include <ATen/mps/MPSStream.h>
#include <ATen/native/mps/MetalShaderLibrary.h>
#include <ATen/native/mps/TensorFactory.h>
#include <c10/core/ScalarType.h>
#include <fmt/format.h>
#include <torch/library.h>
#include <unordered_map>
@ -630,4 +632,147 @@ inline bool needsGather(const TensorBase& t) {
return !is_macOS_15_0_or_newer && (!t.is_contiguous() || t.storage_offset());
}
template <typename T>
void MetalShaderLibrary::exec_unary_kernel_with_params(TensorIteratorBase& iter,
const std::string& name,
T params,
const std::string& params_type_name) {
using namespace at::mps;
// Decompose 64-bit tensor into 32-bit ones
if (!iter.can_use_32bit_indexing()) {
for (auto&& sub_iter : iter.with_32bit_indexing()) {
exec_unary_kernel_with_params(sub_iter, name, params, params_type_name);
}
return;
}
auto inputTensor = iter.input(0);
auto outputTensor = iter.output(0);
uint32_t length = iter.numel();
if (length == 0) {
return;
}
auto kernel_name = fmt::format("{}_{}_{}_{}{}",
name,
iter.is_contiguous() ? "dense" : "strided",
scalarToMetalTypeString(outputTensor),
scalarToMetalTypeString(inputTensor),
fmt::format("_{}", params_type_name));
@autoreleasepool {
auto cplState = getPipelineStateForFunc(kernel_name);
MPSStream* mpsStream = getCurrentMPSStream();
dispatch_sync(mpsStream->queue(), ^() {
auto computeEncoder = mpsStream->commandEncoder();
getMPSProfiler().beginProfileKernel(cplState, name, {inputTensor});
[computeEncoder setComputePipelineState:cplState];
bind_iter_tensors(computeEncoder, iter);
if (!iter.is_contiguous()) {
mtl_setArgs<2>(computeEncoder,
outputTensor.sizes(),
inputTensor.strides(),
outputTensor.strides(),
inputTensor.ndimension());
}
detail::mtl_setArg(computeEncoder, params, iter.is_contiguous() ? 2 : 6);
mtl_dispatch1DJob(computeEncoder, cplState, length);
getMPSProfiler().endProfileKernel(cplState);
});
}
}
template <typename T>
void MetalShaderLibrary::exec_binary_kernel_with_params(TensorIteratorBase& iter,
const std::string& name,
T params,
const std::string& params_type_name) {
using namespace mps;
// TODO: Figure a better place to downcast double scalars (probably in tensor iterator itself?)
// Right now running something like 1.0-torch.rand(5, device='mps') will create iterator with
// double as common dtype (because Python floating point are always 64-bit values)
TORCH_CHECK(iter.output().scalar_type() != at::kDouble, "float64 is not supported on MPS");
// Skip for empty iterators
if (iter.numel() == 0) {
return;
}
// Decompose 64-bit tensor into 32-bit ones
if (!iter.can_use_32bit_indexing()) {
for (auto&& sub_iter : iter.with_32bit_indexing()) {
exec_binary_kernel_with_params(sub_iter, name, params, params_type_name);
}
return;
}
auto convert_double_scalar = [](Tensor& t) {
if (t.dim() != 0) {
return;
}
if (t.scalar_type() == kDouble) {
t = t.to(kFloat);
} else if (t.scalar_type() == kComplexDouble) {
t = t.to(kComplexFloat);
}
};
Tensor input = iter.input(0);
Tensor other = iter.input(1);
Tensor out = iter.output();
convert_double_scalar(input);
convert_double_scalar(other);
MPSStream* mpsStream = getCurrentMPSStream();
const auto cast_needed = input.scalar_type() != other.scalar_type();
const auto suffix = iter.is_contiguous() ? "dense" : "strided";
// TODO: Implicitly pass both input and output types to non-cast kernels
const auto kernel_name = cast_needed
? fmt::format("{}_{}_cast_{}_{}", name, suffix, scalarToMetalTypeString(out), params_type_name)
: fmt::format("{}_{}_{}_{}_{}",
name,
suffix,
scalarToMetalTypeString(out),
scalarToMetalTypeString(input),
params_type_name);
dispatch_sync_with_rethrow(mpsStream->queue(), ^() {
@autoreleasepool {
auto computeEncoder = mpsStream->commandEncoder();
auto binaryPSO = getPipelineStateForFunc(kernel_name);
// this function call is a no-op if MPS Profiler is not enabled
getMPSProfiler().beginProfileKernel(binaryPSO, kernel_name, {input, other});
[computeEncoder setComputePipelineState:binaryPSO];
// Set input and output tensors
bind_iter_tensors(computeEncoder, iter);
// Iterator is contiguous if all of its elements are dense in storage,
// i.e. it's true for both row-first and column-first tensors
if (iter.is_contiguous()) {
detail::mtl_setArg(computeEncoder, params, 3);
if (cast_needed) {
std::array<int, 4> size_and_types = {static_cast<int>(c10::elementSize(input.scalar_type())),
static_cast<int>(c10::elementSize(other.scalar_type())),
static_cast<int>(input.scalar_type()),
static_cast<int>(other.scalar_type())};
mtl_setBytes(computeEncoder, size_and_types, 4);
}
} else {
// Please note that shapes and strides of the iterator might be
// different than that of its operands, for example binary op
// between 4x4 tensor and scalar will result in 1D 16 element iterator
std::array<int, 4> ndim_and_types = {iter.ndim(),
static_cast<int>(input.scalar_type()),
static_cast<int>(other.scalar_type()),
static_cast<int>(out.scalar_type())};
mtl_setArgs<3>(
computeEncoder, params, iter.shape(), iter.strides(0), iter.strides(1), iter.strides(2), ndim_and_types);
}
mtl_dispatch1DJob(computeEncoder, binaryPSO, iter.numel());
getMPSProfiler().endProfileKernel(binaryPSO);
}
});
}
} // namespace at::native::mps

View File

@ -0,0 +1,16 @@
#pragma once
template <typename T>
struct ELUParams {
T alpha;
T scale;
T input_scale;
};
template <typename T>
struct ELUBackwardParams {
T alpha;
T scale;
T input_scale;
bool is_result;
};

View File

@ -1,3 +1,4 @@
#include <ATen/native/mps/kernels/Activation.h>
#include <c10/metal/indexing.h>
#include <c10/metal/special_math.h>
#include <metal_stdlib>
@ -99,6 +100,59 @@ REGISTER_BINARY_OP(hardswish_backward, float, float);
REGISTER_BINARY_OP(hardswish_backward, half, half);
REGISTER_BINARY_OP(hardswish_backward, bfloat, bfloat);
struct elu_functor {
template <typename T>
inline T operator()(const T self_, const ELUParams<T> params) {
using op_T = opmath_t<T>;
auto alpha = static_cast<op_T>(params.alpha);
auto scale = static_cast<op_T>(params.scale);
auto input_scale = static_cast<op_T>(params.input_scale);
auto self = static_cast<op_T>(self_);
auto neg_res = alpha * (::metal::precise::exp(self * input_scale) - 1);
return static_cast<T>(scale * (self < 0 ? neg_res : self));
}
};
struct elu_backward_functor {
template <typename T>
inline T operator()(
const T grad_output_,
const T self_,
ELUBackwardParams<T> params) {
using op_T = opmath_t<T>;
auto alpha = static_cast<op_T>(params.alpha);
auto scale = static_cast<op_T>(params.scale);
auto input_scale = static_cast<op_T>(params.input_scale);
auto grad_output = static_cast<op_T>(grad_output_);
auto self = static_cast<op_T>(self_);
if (params.is_result) {
auto neg_coef = input_scale * (self + alpha * scale);
return static_cast<T>(grad_output * (self <= 0 ? neg_coef : scale));
} else {
auto neg_coef = input_scale * alpha * scale *
::metal::precise::exp(self * input_scale);
return static_cast<T>(grad_output * (self <= 0 ? neg_coef : scale));
}
}
};
#define REGISTER_ELU_OP(T) \
typedef ELUParams<T> ELUParams_##T; \
REGISTER_UNARY_ALPHA_OP(elu, T, ELUParams_##T, T);
REGISTER_ELU_OP(float);
REGISTER_ELU_OP(half);
REGISTER_ELU_OP(bfloat);
#define REGISTER_ELU_BACKWARD_OP(T) \
typedef ELUBackwardParams<T> ELUBackwardParams_##T; \
REGISTER_BINARY_ALPHA_OP(elu_backward, T, ELUBackwardParams_##T, T);
REGISTER_ELU_BACKWARD_OP(float);
REGISTER_ELU_BACKWARD_OP(half);
REGISTER_ELU_BACKWARD_OP(bfloat);
struct leaky_relu_functor {
template <typename T>
inline T operator()(const T x, const T negative_slope) {

View File

@ -11,8 +11,6 @@
#include <ATen/ops/_log_softmax_native.h>
#include <ATen/ops/_prelu_kernel_backward_native.h>
#include <ATen/ops/_prelu_kernel_native.h>
#include <ATen/ops/elu_backward_native.h>
#include <ATen/ops/elu_native.h>
#include <ATen/ops/gelu_backward_native.h>
#include <ATen/ops/gelu_native.h>
#include <ATen/ops/glu_backward_native.h>
@ -698,194 +696,6 @@ TORCH_IMPL_FUNC(gelu_backward_out_mps)
}
}
static void elu_variants_out_mps(const Tensor& self,
const Scalar& alpha,
const Scalar& scale,
const Scalar& input_scale,
const Tensor& result,
std::string func_name) {
using namespace mps;
using CachedGraph = MPSUnaryCachedGraph;
auto resultMemFormat = result.suggest_memory_format();
bool executeGatherOp = !(self.is_contiguous(resultMemFormat) && result.is_contiguous(resultMemFormat));
Tensor out;
if (executeGatherOp) {
out = at::empty_like(result, MemoryFormat::Contiguous);
}
// Empty output
if (result.numel() == 0) {
return;
}
MPSStream* stream = getCurrentMPSStream();
@autoreleasepool {
std::string key = func_name + ":" + getTensorsStringKey({self}) + ":" + std::to_string(alpha.to<double>()) + ":" +
std::to_string(scale.to<double>()) + ":" + std::to_string(input_scale.to<double>());
auto cachedGraph = LookUpOrCreateCachedGraph<CachedGraph>(key, [&](auto mpsGraph, auto newCachedGraph) {
MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, self);
// scale * (max(0, x) + min(0, alpha * (exp(input_scale * x) - 1) ))
MPSGraphTensor* alphaTensor = [mpsGraph constantWithScalar:alpha.to<double>()
shape:@[ @1 ]
dataType:getMPSDataType(self)];
MPSGraphTensor* inputScaleTensor = [mpsGraph constantWithScalar:input_scale.to<double>()
shape:@[ @1 ]
dataType:getMPSDataType(self)];
MPSGraphTensor* scaleTensor = [mpsGraph constantWithScalar:scale.to<double>()
shape:@[ @1 ]
dataType:getMPSDataType(self)];
MPSGraphTensor* unitTensor = [mpsGraph constantWithScalar:1.0f shape:@[ @1 ] dataType:getMPSDataType(self)];
MPSGraphTensor* zeroTensor = [mpsGraph constantWithScalar:0.0f shape:@[ @1 ] dataType:getMPSDataType(self)];
MPSGraphTensor* scaledInputTensor = [mpsGraph multiplicationWithPrimaryTensor:inputTensor
secondaryTensor:inputScaleTensor
name:nil];
MPSGraphTensor* exponentTensor = [mpsGraph exponentWithTensor:scaledInputTensor name:nil];
MPSGraphTensor* exponentMinusOneTensor = [mpsGraph subtractionWithPrimaryTensor:exponentTensor
secondaryTensor:unitTensor
name:nil];
MPSGraphTensor* alphaTimesTensor = [mpsGraph multiplicationWithPrimaryTensor:exponentMinusOneTensor
secondaryTensor:alphaTensor
name:nil];
MPSGraphTensor* predicateTensor = [mpsGraph greaterThanWithPrimaryTensor:inputTensor
secondaryTensor:zeroTensor
name:nil];
MPSGraphTensor* fusedOutput = [mpsGraph selectWithPredicateTensor:predicateTensor
truePredicateTensor:inputTensor
falsePredicateTensor:alphaTimesTensor
name:nil];
MPSGraphTensor* outputTensor = [mpsGraph multiplicationWithPrimaryTensor:fusedOutput
secondaryTensor:scaleTensor
name:nil];
newCachedGraph->inputTensor_ = inputTensor;
newCachedGraph->outputTensor_ = outputTensor;
});
auto selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self, nil, executeGatherOp);
auto outputPlaceholder = Placeholder(cachedGraph->outputTensor_, out.has_storage() ? out : result, nil, false);
auto feeds = dictionaryFromPlaceholders(selfPlaceholder);
runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder);
if (out.has_storage()) {
result.copy_(out);
}
}
}
// scale * (max(0, x) + min(0, alpha * (exp(input_scale * x) - 1) ))
TORCH_IMPL_FUNC(elu_out_mps)
(const Tensor& self, const Scalar& alpha, const Scalar& scale, const Scalar& input_scale, const Tensor& result) {
elu_variants_out_mps(self, alpha, scale, input_scale, result, "elu_out_mps");
}
TORCH_IMPL_FUNC(elu_backward_out_mps)
(const Tensor& grad_output,
const Scalar& alpha,
const Scalar& scale,
const Scalar& input_scale,
bool is_result,
const Tensor& self_or_result,
const Tensor& grad_input) {
using namespace mps;
using CachedGraph = MPSUnaryGradCachedGraph;
auto gradMemFormat = grad_input.suggest_memory_format();
bool executeGatherOp = !(grad_output.is_contiguous(gradMemFormat) && self_or_result.is_contiguous(gradMemFormat) &&
grad_input.is_contiguous(gradMemFormat));
Tensor out;
if (executeGatherOp && gradMemFormat == MemoryFormat::ChannelsLast) {
out = at::empty_like(grad_input, MemoryFormat::Contiguous);
}
// Empty output
if (grad_input.numel() == 0) {
return;
}
MPSStream* stream = getCurrentMPSStream();
@autoreleasepool {
std::string key = "elu_backward_out_mps:" + getTensorsStringKey({grad_output, self_or_result}) + ":" +
std::to_string(alpha.to<double>()) + ":" + std::to_string(scale.to<double>()) + ":" +
std::to_string(input_scale.to<double>()) + ":" + std::to_string(is_result);
auto cachedGraph = LookUpOrCreateCachedGraph<CachedGraph>(key, [&](auto mpsGraph, auto newCachedGraph) {
MPSGraphTensor* gradOutputTensor = mpsGraphRankedPlaceHolder(mpsGraph, grad_output);
MPSGraphTensor* selfOrResultTensor = mpsGraphRankedPlaceHolder(mpsGraph, self_or_result);
MPSGraphTensor* lessThanZeroGradTensor = nil;
if (is_result) {
MPSGraphTensor* alphaTensor = [mpsGraph constantWithScalar:alpha.to<double>()
shape:@[ @1 ]
dataType:getMPSDataType(grad_output)];
MPSGraphTensor* resultPlusAlphaTensor = [mpsGraph additionWithPrimaryTensor:selfOrResultTensor
secondaryTensor:alphaTensor
name:nil];
auto constMul = scale.to<double>() * input_scale.to<double>();
MPSGraphTensor* constMulTensor = [mpsGraph constantWithScalar:constMul
shape:@[ @1 ]
dataType:getMPSDataType(grad_output)];
lessThanZeroGradTensor = [mpsGraph multiplicationWithPrimaryTensor:resultPlusAlphaTensor
secondaryTensor:constMulTensor
name:nil];
} else {
MPSGraphTensor* inputScaleTensor = [mpsGraph constantWithScalar:input_scale.to<double>()
shape:@[ @1 ]
dataType:getMPSDataType(grad_output)];
MPSGraphTensor* scaledInputTensor = [mpsGraph multiplicationWithPrimaryTensor:selfOrResultTensor
secondaryTensor:inputScaleTensor
name:nil];
MPSGraphTensor* expTensor = [mpsGraph exponentWithTensor:scaledInputTensor name:nil];
auto constMul = scale.to<double>() * input_scale.to<double>() * alpha.to<double>();
MPSGraphTensor* constMulTensor = [mpsGraph constantWithScalar:constMul
shape:@[ @1 ]
dataType:getMPSDataType(grad_output)];
lessThanZeroGradTensor = [mpsGraph multiplicationWithPrimaryTensor:expTensor
secondaryTensor:constMulTensor
name:nil];
}
MPSGraphTensor* scaleTensor = [mpsGraph constantWithScalar:scale.to<double>()
shape:@[ @1 ]
dataType:getMPSDataType(grad_output)];
MPSGraphTensor* zeroTensor = [mpsGraph constantWithScalar:0.0f
shape:@[ @1 ]
dataType:getMPSDataType(grad_output)];
MPSGraphTensor* predicateTensor = [mpsGraph greaterThanWithPrimaryTensor:selfOrResultTensor
secondaryTensor:zeroTensor
name:nil];
MPSGraphTensor* gradTensor = [mpsGraph selectWithPredicateTensor:predicateTensor
truePredicateTensor:scaleTensor
falsePredicateTensor:lessThanZeroGradTensor
name:nil];
MPSGraphTensor* gradInputTensor = [mpsGraph multiplicationWithPrimaryTensor:gradTensor
secondaryTensor:gradOutputTensor
name:nil];
newCachedGraph->gradOutputTensor_ = gradOutputTensor;
newCachedGraph->inputTensor_ = selfOrResultTensor;
newCachedGraph->gradInputTensor_ = gradInputTensor;
});
Placeholder gradOutputPlaceholder = Placeholder(cachedGraph->gradOutputTensor_, grad_output, nil, executeGatherOp);
Placeholder selfOrResultPlaceholder = Placeholder(cachedGraph->inputTensor_, self_or_result, nil, executeGatherOp);
Placeholder gradInputPlaceholder =
Placeholder(cachedGraph->gradInputTensor_, out.has_storage() ? out : grad_input, nil, false);
auto feeds = dictionaryFromPlaceholders(gradOutputPlaceholder, selfOrResultPlaceholder);
runMPSGraph(stream, cachedGraph->graph(), feeds, gradInputPlaceholder);
if (out.has_storage()) {
grad_input.copy_(out);
}
}
}
TORCH_IMPL_FUNC(glu_out_mps)(const Tensor& self, const int64_t dim, const Tensor& output) {
using namespace mps;
using CachedGraph = MPSUnaryCachedGraph;

View File

@ -1,8 +1,10 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/Dispatch.h>
#include <ATen/TensorIterator.h>
#include <ATen/mps/MPSProfiler.h>
#include <ATen/native/Activation.h>
#include <ATen/native/mps/OperationUtils.h>
#include <ATen/native/mps/kernels/Activation.h>
#include <fmt/format.h>
namespace at::native {
@ -41,6 +43,30 @@ static void hardswish_backward_kernel(at::TensorIterator& iter) {
lib.exec_binary_kernel(iter, "hardswish_backward");
}
static void elu_kernel(TensorIteratorBase& iter, const Scalar& alpha, const Scalar& scale, const Scalar& input_scale) {
AT_DISPATCH_FLOATING_TYPES_AND2(c10::kHalf, c10::kBFloat16, iter.common_dtype(), "elu_mps", [&]() {
ELUParams<scalar_t> params{alpha.to<scalar_t>(), scale.to<scalar_t>(), input_scale.to<scalar_t>()};
lib.exec_unary_kernel_with_params(
iter, "elu", params, fmt::format("ELUParams_{}", mps::scalarToMetalTypeString(iter.common_dtype())));
});
}
static void elu_backward_kernel(TensorIteratorBase& iter,
const Scalar& alpha,
const Scalar& scale,
const Scalar& input_scale,
bool is_result) {
AT_DISPATCH_FLOATING_TYPES_AND2(c10::kHalf, c10::kBFloat16, iter.common_dtype(), "elu_backward_mps", [&]() {
ELUBackwardParams<scalar_t> params{
alpha.to<scalar_t>(), scale.to<scalar_t>(), input_scale.to<scalar_t>(), is_result};
lib.exec_binary_kernel_with_params(
iter,
"elu_backward",
params,
fmt::format("ELUBackwardParams_{}", mps::scalarToMetalTypeString(iter.common_dtype())));
});
}
static void leaky_relu_kernel(TensorIteratorBase& iter, const Scalar& negative_slope) {
lib.exec_unary_kernel(iter, "leaky_relu", negative_slope);
}
@ -56,6 +82,8 @@ REGISTER_DISPATCH(hardsigmoid_stub, hardsigmoid_kernel);
REGISTER_DISPATCH(hardsigmoid_backward_stub, hardsigmoid_backward_kernel);
REGISTER_DISPATCH(hardswish_stub, hardswish_kernel);
REGISTER_DISPATCH(hardswish_backward_stub, hardswish_backward_kernel);
REGISTER_DISPATCH(elu_stub, elu_kernel);
REGISTER_DISPATCH(elu_backward_stub, elu_backward_kernel);
REGISTER_DISPATCH(leaky_relu_stub, leaky_relu_kernel);
REGISTER_DISPATCH(leaky_relu_backward_stub, leaky_relu_backward_kernel);

View File

@ -12064,8 +12064,7 @@
device_check: NoCheck # TensorIterator
python_module: nn
dispatch:
CPU, CUDA: elu_out
MPS: elu_out_mps
CPU, CUDA, MPS: elu_out
- func: elu(Tensor self, Scalar alpha=1, Scalar scale=1, Scalar input_scale=1) -> Tensor
structured_delegate: elu.out
@ -12078,8 +12077,7 @@
structured_inherits: TensorIteratorBase
python_module: nn
dispatch:
CPU, CUDA: elu_backward_out
MPS: elu_backward_out_mps
CPU, CUDA, MPS: elu_backward_out
- func: elu_backward(Tensor grad_output, Scalar alpha, Scalar scale, Scalar input_scale, bool is_result, Tensor self_or_result) -> Tensor
structured_delegate: elu_backward.grad_input

View File

@ -828,9 +828,6 @@ inductor_one_sample["cuda"] = {
"nn.functional.fractional_max_pool3d": {f16, f32, f64},
"nn.functional.group_norm": {f16},
"nn.functional.hinge_embedding_loss": {f16},
# Enabling all tests for this test fails randomly
# See https://github.com/pytorch/pytorch/issues/129238
"nn.functional.huber_loss": {f16},
"nn.functional.interpolate.bicubic": {f16},
"nn.functional.interpolate.bilinear": {f16},
"nn.functional.interpolate.trilinear": {f16},
@ -948,9 +945,6 @@ inductor_one_sample["xpu"] = {
"nn.functional.fractional_max_pool3d": {f16, f32, f64},
"nn.functional.group_norm": {f16},
"nn.functional.hinge_embedding_loss": {f16},
# Enabling all tests for this test fails randomly
# See https://github.com/pytorch/pytorch/issues/129238
"nn.functional.huber_loss": {f16},
"nn.functional.interpolate.bicubic": {f16},
"nn.functional.interpolate.bilinear": {f16},
"nn.functional.interpolate.trilinear": {f16},

View File

@ -421,7 +421,7 @@ RESET_GRAD_ACCUMULATOR = {"set_", "resize_"}
# inplace or out-variants)
# If the function does not modify its arguments, we also check the following properties
# pertaining to its output:
# 2) Its TensorImpl has use_count of 1
# 2) Its TensorImpl has use_count of 1 (or 2 if it has a PyObject)
# 3) If the function is a view function, it has the same StorageImpl as that of
# the input it is aliased with. Otherwise, its StorageImpl has use_count of 1
#
@ -496,10 +496,10 @@ if (${tensor_name}_impl_saved && !at::impl::dispatch_mode_enabled() && !at::impl
"""
)
ENFORCE_TENSOR_IMPL_USE_COUNT_LT_OR_EQ_ONE = CodeTemplate(
ENFORCE_TENSOR_IMPL_USE_COUNT = CodeTemplate(
"""\
if (!at::impl::dispatch_mode_enabled() && !at::impl::tensor_has_dispatch(${tensor_name}))
TORCH_INTERNAL_ASSERT(${tensor_name}.use_count() <= 1, "function: ${fn_name}");
TORCH_INTERNAL_ASSERT(${tensor_name}.use_count() == expected_fresh_use_count(${tensor_name}), "function: ${fn_name}");
"""
)
@ -1664,7 +1664,7 @@ def emit_body(
if type_wrapper_name(f) not in DONT_ENFORCE_TENSOR_IMPL_USE_COUNT:
stmts_after_call += [
ENFORCE_TENSOR_IMPL_USE_COUNT_LT_OR_EQ_ONE.substitute(
ENFORCE_TENSOR_IMPL_USE_COUNT.substitute(
tensor_name=ret_name, fn_name=type_wrapper_name(f)
)
]

View File

@ -47,6 +47,18 @@ namespace{
meta->grad_accumulator_.reset();
}
}
[[maybe_unused]] size_t expected_fresh_use_count(const Variable& self) {
if (!self.defined()) {
// An UndefinedTensorImpl always has a use count of 0
return 0;
}
if (self.unsafeGetTensorImpl()->pyobj_slot()->load_pyobj() != nullptr) {
// A TensorImpl with a Python object has a use count of 2
return 2;
}
// A fresh TensorImpl (with no PyObject) has a use count of 1
return 1;
}
}
namespace {

View File

@ -138,7 +138,7 @@ inline void PyErr_SetString(PyObject* type, const std::string& message) {
throw; \
} \
} \
catch (const std::exception& e) { \
catch (const std::exception&) { \
torch::translate_exception_to_python(std::current_exception()); \
return retval; \
}

View File

@ -81,7 +81,7 @@ c10::intrusive_ptr<Backend> ProcessGroup::getBackend(
ProcessGroup::BackendType backendType{ProcessGroup::BackendType::UNDEFINED};
try {
backendType = deviceTypeToBackendType_.at(deviceType);
} catch (const std::out_of_range& e) {
} catch (const std::out_of_range&) {
TORCH_CHECK(
false, "No backend type associated with device type ", deviceType);
}

View File

@ -246,7 +246,7 @@ class UvTcpServer : public UvTcpSocket {
uv_err_name(uv_res),
uv_strerror(uv_res)));
res->cacheSocketPort();
} catch (std::exception& ex) {
} catch (std::exception&) {
res->close();
throw;
}
@ -322,7 +322,7 @@ class UvTcpServer : public UvTcpSocket {
uv_err_name(uv_res),
uv_strerror(uv_res)));
res->cacheSocketPort();
} catch (std::exception& ex) {
} catch (std::exception&) {
res->close();
throw;
}

View File

@ -353,7 +353,7 @@ static PyObject* NodeBase__update_args_kwargs(
Py_CLEAR(node->_kwargs);
node->_kwargs = map_aggregate(args[1], visit_fn);
Py_RETURN_NONE;
} catch (const PythonError& e) {
} catch (const PythonError&) {
return nullptr;
}
}
@ -397,7 +397,7 @@ static PyObject* NodeBase__replace_input_with(
PyObject* update_args[2] = {new_args.get(), new_kwargs.get()};
return NodeBase__update_args_kwargs(self, update_args, 2);
} catch (const PythonError& e) {
} catch (const PythonError&) {
return nullptr;
}
}
@ -802,7 +802,7 @@ static PyObject* py_map_aggregate(
// args[0]: aggregate, args[1]: callable fn
return map_aggregate(
args[0], [fn](PyObject* a) { return PyObject_CallOneArg(fn, a); });
} catch (const PythonError& e) {
} catch (const PythonError&) {
return nullptr; // error should already be set
}
}
@ -824,7 +824,7 @@ static PyObject* py_map_arg(
}
return Py_NewRef(a);
});
} catch (const PythonError& e) {
} catch (const PythonError&) {
return nullptr; // error should already be set
}
}

View File

@ -117,7 +117,7 @@ struct type_caster<torch::jit::IValue> {
try {
value = torch::jit::toTypeInferredIValue(src);
return true;
} catch (std::exception& e) {
} catch (std::exception&) {
return false;
}
}
@ -142,7 +142,7 @@ struct type_caster<torch::jit::Symbol> {
std::string src_str;
try {
src_str = py::cast<std::string>(src);
} catch (std::exception& e) {
} catch (std::exception&) {
return false;
}
value = torch::jit::Symbol::fromQualString(src_str);

View File

@ -285,7 +285,7 @@ struct FromImpl<torch::headeronly::HeaderOnlyArrayRef<T>> {
torch_list_push_back(new_list_handle, from(elem)));
}
return from(new_list_handle);
} catch (const std::runtime_error& e) {
} catch (const std::runtime_error&) {
if (new_list_handle != nullptr) {
// clean up memory if an error was thrown
TORCH_ERROR_CODE_CHECK(torch_delete_list(new_list_handle));
@ -553,7 +553,7 @@ struct ToImpl<std::vector<T>> {
}
TORCH_ERROR_CODE_CHECK(torch_delete_list(list_handle));
return result;
} catch (const std::runtime_error& e) {
} catch (const std::runtime_error&) {
// clean up memory if an exception is thrown, and rethrow
TORCH_ERROR_CODE_CHECK(torch_delete_list(list_handle));
throw;