Compare commits

..

96 Commits

Author SHA1 Message Date
b66475e748 chore: trigger CI 2025-09-29 11:22:41 -07:00
53abb359aa Add cancel-in-progress for re-run of CI jobs 2025-09-29 09:43:00 -07:00
84b57c93db [MPSInductor] Unskip test_repeat_interleave_Tensor_decomp (#164136)
Not sure what the problem was, but it passes for me locally

Fixes https://github.com/pytorch/pytorch/issues/159408

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164136
Approved by: https://github.com/v0i0
2025-09-29 16:20:34 +00:00
069ccf5f1e [inductor] pdl: enable launch and deduplicate waits (#162014)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162014
Approved by: https://github.com/eellison
2025-09-29 16:10:26 +00:00
1c12d7416b [SDPA] [MPS] Fixes regression in 2.8.0 for scaled_dot_product_attention using mps (#163598)
Fixes #163597

- Updates the fast SDPA implementations to take in query tensor stride info, similar to key and value, instead of assuming the stride.
- Updated tests with additional transpose/permutation layouts. The new tests catch the regression (a repro sketch follows below).
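
A hedged repro sketch of the affected layout (shapes and dtype are assumed for illustration; requires an Apple-silicon machine with MPS):

```py
# Hedged sketch: a permuted, non-contiguous query tensor passed to SDPA on MPS,
# the kind of layout the new transpose/permute tests cover.
import torch
import torch.nn.functional as F

q = torch.randn(1, 16, 8, 64, device="mps", dtype=torch.float16).transpose(1, 2)  # (B, H, S, D), non-contiguous
k = torch.randn(1, 8, 16, 64, device="mps", dtype=torch.float16)
v = torch.randn(1, 8, 16, 64, device="mps", dtype=torch.float16)
out = F.scaled_dot_product_attention(q, k, v)  # before the fix, this layout could return wrong results (#163597)
print(out.shape)
```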

### Benchmarking with script found in [implementation PR](https://github.com/pytorch/pytorch/pull/152781#:~:text=19.8%25%20speed%20improvement-,Script%20to%20get%20perf%3A,-import%20torch%0Aimport)

Times are averaged over 100000 iterations. This change should not make any significant performance difference. Tested on an M3 Pro.

### Vector Fast Path (q_len=1, k_len=256)

- Before: 0.160 ms
- After: 0.157 ms

### Vector 2-pass (q_len=1, k_len=4096)

- Before: 0.342 ms
- After: 0.339 ms

### Vector Fast Path (q_len=8, k_len=256)

- Before: 0.228 ms
- After: 0.231 ms

### Vector 2-pass (q_len=8, k_len=4096)

- Before: 0.432 ms
- After: 0.436 ms

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163598
Approved by: https://github.com/malfet
2025-09-29 16:09:46 +00:00
3746039b47 [inductor] fix: 'get_raw_stream' undefined (#163707)
Summary:
ran into this when precompiling baidu/ERNIE-4.5-21B-A3B-PT

codegen after fix:
```py
import triton
import triton.language as tl
from torch._inductor.runtime.triton_heuristics import start_graph, end_graph
from torch._C import _cuda_getCurrentRawStream as get_raw_stream
with torch.cuda._DeviceGuard(0):
    stream0 = get_raw_stream(0)
...
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163707
Approved by: https://github.com/jamesjwu
2025-09-29 15:48:16 +00:00
872edd89d6 Enable outer reductions in fbcode (#163884)
Summary: Enabling the outer reduction optimization in fbcode

Test Plan: Evals in https://docs.google.com/document/d/1-tcItRsyEaibaXL56Zq2-CWh5wCmHXDDgDQT_9uOvXE/edit?tab=t.0#bookmark=id.tkgzaitxacg0

Differential Revision: D81948542

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163884
Approved by: https://github.com/Skylion007
2025-09-29 15:25:17 +00:00
47ed41109f Fix PgNccl coalesced profiling (#160680)
Admittedly I'm a noob when looking at traces, but this looked pretty off to me:
<img width="1528" height="824" alt="Screenshot 2025-08-14 at 5 27 49 PM" src="https://github.com/user-attachments/assets/871e7b4c-0e47-4c84-97cc-8198b7b76d4b" />
1. Why are there so many "nccl:coalesced" on the CPU thread
2. Why is there "nccl:coalesced" on compute stream (stream 7)

Here is what is happening:

**CPU side**: In `endCoalescing`, we create a [work object ](3be70dc30e/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp (L3473)) with the profiling title "nccl:coalesced"
**GPU side**: The CUDA kernels will inherit this profiling title

What is missing:

We forgot to call the record function [callback](3be70dc30e/torch/csrc/distributed/c10d/Work.cpp (L35-L38)). With this change we finish immediately on the CPU side, but the ncclDevKernel_SendRecv kernels still have the coalesced title. The new trace looks like this:

<img width="1123" height="637" alt="image" src="https://github.com/user-attachments/assets/f015fd64-85cd-452a-be24-3e7724f84e44" />

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160680
Approved by: https://github.com/fegin, https://github.com/kwen2501
2025-09-29 15:21:55 +00:00
fa54b08cd5 Replace setup.py install with pip install (#156711)
#156027 already replaced most use of `python setup.py install`.
This PR only adds a few more occurrences and adds `--no-build-isolation` in a few places.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/156711
Approved by: https://github.com/atalman
2025-09-29 15:15:10 +00:00
92284fb2ff Add SVE128 ISA (#158932)
Summary: Partly Importing and adapting https://github.com/pytorch/pytorch/pull/138388, adding SVE128 as ISA.

Intention is to add SVE128 translation layers for Vectorized data types.
Idea is to have 1 PR per file, aside from the current one, plus a last one modifying cmake files to enable the new ISA selectively.

Tested current changes on a nightly run, to verify no regressions occur on systems leveraging SVE256.

No regressions spotted when running test_ops.py, a set of 34k unit tests. A machine leveraging SVE128 was used for this testing.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158932
Approved by: https://github.com/malfet
2025-09-29 14:49:19 +00:00
84d673ef57 Add less warps config to inner reductions (#162447)
Use fewer warps to ensure proper vectorization + memory coalescing for inner reductions, preferring more work per thread

<img width="1717" height="731" alt="Screenshot 2025-09-17 at 10 03 25 AM" src="https://github.com/user-attachments/assets/7b1f4a30-62f2-4bee-bb9c-122501bde63e" />

Differential Revision: [D83343892](https://our.internmc.facebook.com/intern/diff/D83343892)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162447
Approved by: https://github.com/v0i0, https://github.com/eellison, https://github.com/shunting314
2025-09-29 13:48:36 +00:00
d633bac252 Update issue templates adding a DISABLE AUTOREVERT option (#163858)
This should be used to disable the autorevert functionality if users feel the need to.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163858
Approved by: https://github.com/izaitsevfb
2025-09-29 13:10:05 +00:00
d81476e211 [xla hash update] update the pinned xla hash (#163494)
This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml).
Update the pinned xla hash.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163494
Approved by: https://github.com/pytorchbot
2025-09-29 12:31:16 +00:00
a0ae2f9aa0 Update slow tests (#163493)
This PR is auto-generated weekly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/weekly.yml).
Update the list of slow tests.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163493
Approved by: https://github.com/pytorchbot
2025-09-29 11:58:17 +00:00
615da7b95e [fx] Allow customization of submod name in split graph (#164035)
Fixes #164030 (HOP and pipelining both name things submod_i)
by adding an optional argument `partition_affix` to the `split_module` API.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164035
Approved by: https://github.com/ezyang
ghstack dependencies: #164045
2025-09-29 09:16:36 +00:00
4fd70d4e7b [1/N]Enable some tests in test_ops.TestCommon on Intel GPU (#159944)
For https://github.com/pytorch/pytorch/issues/114850, we will port aten unit tests to Intel GPU. This PR works on some test cases of test/test_ops.py. We enable Intel GPU with the following methods and try our best to keep the original code style:

1. Extended XPUTestBase.get_all_devices to support multiple devices
2. Added skipXPU decorator
3. Extended onlyOn to support device lists
4. Enabled 'xpu' for some test paths
5. Added allow_xpu=True for supported test classes
6. Replaced onlyCUDA with onlyOn(['cuda', 'xpu']) for supported tests (see the sketch after this list)
7. Used skipIfXpu and skipXPU to disable unsupported tests
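
A hedged sketch of the pattern from item 6 (a hypothetical test, not copied from test/test_ops.py):

```py
# Hypothetical example showing onlyOn with a device list and allow_xpu=True;
# the test name and body are illustrative only.
import torch
from torch.testing._internal.common_device_type import (
    instantiate_device_type_tests,
    onlyOn,
)
from torch.testing._internal.common_utils import TestCase, run_tests


class TestExample(TestCase):
    @onlyOn(["cuda", "xpu"])  # previously @onlyCUDA; list support is what this PR adds
    def test_add(self, device):
        x = torch.ones(4, device=device)
        self.assertEqual((x + x).sum().item(), 8.0)


instantiate_device_type_tests(TestExample, globals(), allow_xpu=True)

if __name__ == "__main__":
    run_tests()
```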

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159944
Approved by: https://github.com/guangyey, https://github.com/EikanWang, https://github.com/albanD
2025-09-29 09:08:04 +00:00
e1e5e040cd [dynamo][export] Add some missing trace rules (#164080)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164080
Approved by: https://github.com/tugsbayasgalan
2025-09-29 08:47:24 +00:00
5ddad22196 [PP] Use default export mode (non-strict) (#164045)
export's default mode has switched from strict to non-strict. We just follow suit in PP.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164045
Approved by: https://github.com/H-Huang
2025-09-29 06:31:06 +00:00
90512fa5bd [Quant] extend the op list for quant lift up (#163621)
Add `aten.reshape.default` into the op list of quant lift up, in order to fuse more potential quantized kernels.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163621
Approved by: https://github.com/mingfeima, https://github.com/Xia-Weiwen, https://github.com/jansel
2025-09-29 06:14:45 +00:00
48a5470cf8 [CUDA] fix indexing on large tensor causing invalid configuration argument (#164049)
Fixes #164048

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164049
Approved by: https://github.com/eqy
2025-09-29 06:07:35 +00:00
b9854c9d89 [Inductor][CPP] Fix the test case of test_linear_reuse_kernels (#163723)
Fixes #163491.
Add tolerances to make `test_linear_reuse_kernels` more stable.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163723
Approved by: https://github.com/leslie-fang-intel
2025-09-29 05:29:01 +00:00
eb4361a801 [Fix] Adding missing f prefixes to formatted strings [1/N] (#164065)
As stated in the title.

* #164068
* #164067
* #164066
* __->__ #164065

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164065
Approved by: https://github.com/Skylion007
2025-09-29 04:53:00 +00:00
d131f213ac [vllm hash update] update the pinned vllm hash (#164092)
This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml).
Update the pinned vllm hash.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164092
Approved by: https://github.com/pytorchbot
2025-09-29 04:41:06 +00:00
7c7ae86991 [Fix] Adding missing f prefixes to formatted strings [2/N] (#164066)
As stated in the title.

* #164068
* #164067
* __->__ #164066
* #164065

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164066
Approved by: https://github.com/Skylion007
2025-09-29 04:40:44 +00:00
ad32ed83b3 [Fix] Adding missing f prefixes to formatted strings [3/N] (#164067)
As stated in the title.

* #164068
* __->__ #164067
* #164066
* #164065

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164067
Approved by: https://github.com/Skylion007
2025-09-29 04:35:23 +00:00
d8becd1cf4 [dynamo][export] Make the source_stack and fqn info same between dynamo and export (#164085)
preparing for landing the install_free_tensors flag

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164085
Approved by: https://github.com/tugsbayasgalan
2025-09-29 04:35:13 +00:00
e64dd8c694 [Fix] Adding missing f prefixes to formatted strings [4/N] (#164068)
As stated in the title.

* __->__ #164068
* #164067
* #164066
* #164065

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164068
Approved by: https://github.com/Skylion007
2025-09-29 04:07:07 +00:00
047ae24e34 Eliminate setup.py install/develop in the codebase (#162329)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162329
Approved by: https://github.com/ezyang
2025-09-29 03:54:28 +00:00
3cda34ebde [2/N] Apply ruff UP035 check in torch files (#164054)
This is the result of applying the ruff `UP035` check.
`Callable` is imported from `collections.abc` instead of `typing`.
`TypeAlias` is also imported from `typing`.
This PR is the follow-up of #163947.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164054
Approved by: https://github.com/ezyang, https://github.com/Skylion007
2025-09-29 03:35:32 +00:00
352197c508 Remove old ROCm skip conditions in tests (#164058)
This PR removes skip conditions for ROCM <= 3.5.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164058
Approved by: https://github.com/kwen2501
2025-09-29 03:00:58 +00:00
811c693c49 [dynamo] Special path for cloning of torch dispatch tensors (#164081)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164081
Approved by: https://github.com/tugsbayasgalan
ghstack dependencies: #164084
2025-09-29 01:44:44 +00:00
c2768d0f5a [export] Skip the check instead of disable (#164084)
It's unclear why we had the disable in the first place. With
install_free_tensors, we are tracing into this hook. A better way would
be to place the tracer without any hook. For now, disable the checking
while dynamo is tracing.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164084
Approved by: https://github.com/tugsbayasgalan
2025-09-29 01:44:44 +00:00
a8c528c105 [1/N] Apply UP035 rule in tests (#163947)
Apply UP035 `ruff` rule in tests, but some tests for `fx` and `dynamo` are excluded in case the old typing is the test target.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163947
Approved by: https://github.com/ezyang
2025-09-29 01:42:01 +00:00
dc54ce7554 [hops] Support unspecialized nn module for export hops (#164082)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164082
Approved by: https://github.com/tugsbayasgalan
ghstack dependencies: #164079
2025-09-29 01:34:10 +00:00
1981ed4f60 [dynamo][logging] Add to param_count only if metrics_count is active (#164079)
This is rare but happens with executorch tests.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164079
Approved by: https://github.com/tugsbayasgalan
2025-09-29 00:59:18 +00:00
54b38f3b46 Add operator benchmarking run to CI nightly (#162530)
This PR introduces a new "operator microbenchmark" CI workflow and GitHub Actions for operator microbenchmarks, updating test scripts and job matrices to support new parameters, and broadening the operator benchmark tests to include more data types, larger shapes, and gradient tests. The benchmark configurations now focus more on different cuda hardware and multiple dtypes (bf16, fp16, fp32), for both compile and eager mode.

**Benchmark Configuration and Coverage:**

* Expanded operator benchmark configurations in `addmm_test.py`, `bmm_test.py`, `matmul_test.py`, and `mm_test.py` to benchmark multiple dtypes on CUDA devices, in eager and compile mode, for forward and backward runs. The configs with the tag "long" for the above-mentioned files are run in CI.
* The CI benchmarking runs on various hardware: H100, A100.
* The CI job also uploads the microbenchmarking outputs to a [HUD](https://hud.pytorch.org/benchmark/llms?repoName=pytorch%2Fpytorch&benchmarkName=PyTorch+operator+microbenchmark) dashboard.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162530
Approved by: https://github.com/huydhn

Co-authored-by: Huy Do <huydhn@gmail.com>
2025-09-29 00:46:38 +00:00
bc5a072ebf fixes import error 'functionalize' from functorch (#163746)
Fixes #163637

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163746
Approved by: https://github.com/malfet
2025-09-28 23:16:45 +00:00
d1b3481131 registraion replaced with registration in jit_type.h file comment (#164072)
Fixes #164071

typo correction done
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164072
Approved by: https://github.com/Skylion007
2025-09-28 22:55:24 +00:00
3766513d25 Remove C++ workarounds for Python < 3.10 (#164055)
Remove two unnecessary `PY_VERSION_HEX` branches.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164055
Approved by: https://github.com/ezyang
2025-09-28 20:00:02 +00:00
ea6846b231 [CI] Remove the unnecessary workflow related functorch (#162581)
The [docs](https://docs.pytorch.org/functorch/stable/) for `functorch` have been migrated into the [PyTorch docs](https://docs.pytorch.org/docs/stable/func.html) since PyTorch 2.0, so I think we can remove this workflow now to reduce compute resource usage.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162581
Approved by: https://github.com/ezyang
2025-09-28 19:56:20 +00:00
f6537d9616 Move control flow export tests to new tracer (#163259)
Differential Revision: [D82732614](https://our.internmc.facebook.com/intern/diff/D82732614)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163259
Approved by: https://github.com/avikchaudhuri
ghstack dependencies: #163136, #163137, #163258
2025-09-28 19:56:09 +00:00
cc0332563e Use new_tracer_experimental for torchao strict export (#163258)
The export team is fixing up the old strict export implementation; as a result, it fails a check where we proxy the whole module under given directories. _WrapperModule is a way for torchao to work around the issue that export requires an nn.Module to trace, so it should never get proxied in the graph.

Differential Revision: [D82732613](https://our.internmc.facebook.com/intern/diff/D82732613)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163258
Approved by: https://github.com/anijain2305
ghstack dependencies: #163136, #163137
2025-09-28 19:55:54 +00:00
8239ba4087 Fix various bugs in subclass input in export (#163770)
This adds basic support for subclass inputs in export (specifically for non-strict). I had to make fakify a little more complicated, which risks further divergence from dynamo fakification. But the dynamo one is so complex that I feel it is better to do it this way. Also improved the fake mode detection logic to recursively look into subclass inner tensors.

Differential Revision: [D83156489](https://our.internmc.facebook.com/intern/diff/D83156489)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163770
Approved by: https://github.com/avikchaudhuri
2025-09-28 18:03:32 +00:00
1fdd99de71 Building guards should be under metrics_context (#163967)
Differential Revision: [D83354042](https://our.internmc.facebook.com/intern/diff/D83354042)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163967
Approved by: https://github.com/avikchaudhuri
2025-09-28 16:28:34 +00:00
38ed608956 Better error handling in torch/nativert/* (#163308)
Replace the **runtime_error** of vanilla C++ exceptions with **TORCH_CHECK** in **torch/nativert/***

Vanilla C++ exceptions should not exist in the core part of PyTorch because of its cross-language nature. Compared with vanilla C++ exceptions, TORCH_CHECK has richer error context and a unified error handling mechanism. This commit replaces runtime_error with TORCH_CHECK in the files under
torch/nativert/*.

Fixes part of #148114

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163308
Approved by: https://github.com/dolpm
2025-09-28 14:23:44 +00:00
238dc65368 [ROCm] use hipSolver instead of MAGMA for Cholesky (#163977)
Currently, the Cholesky factorization and least squares operations default to MAGMA when PyTorch is compiled for ROCm, which shows suboptimal performance.
This change allows PyTorch to rely on hipSolver instead of MAGMA.
@jeffdaily

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163977
Approved by: https://github.com/Skylion007
2025-09-28 06:53:06 +00:00
7bbde0c094 Remove unused argument from DEFINE_BINARY macro. (#163868)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163868
Approved by: https://github.com/Skylion007
ghstack dependencies: #163822
2025-09-28 06:32:41 +00:00
dfcab0e7e1 Handle DDE in infer_size_impl (#163822)
Hit this while running vLLM with unbacked symints for the model Qwen/Qwen2-1.5B-Instruct.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163822
Approved by: https://github.com/bobrenjc93, https://github.com/Skylion007
2025-09-28 06:32:41 +00:00
1cc9263f52 [vllm hash update] update the pinned vllm hash (#164053)
This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml).
Update the pinned vllm hash.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164053
Approved by: https://github.com/pytorchbot
2025-09-28 04:35:17 +00:00
c2862c8e66 [distributed] Remove python code older than 3.10 (#163613)
Because the minimum Python version is now 3.10.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163613
Approved by: https://github.com/XuehaiPan, https://github.com/kwen2501
2025-09-28 04:15:24 +00:00
b377c9e365 graph break on tolist if capture_scalar_outputs is false (#163807)
Addresses https://github.com/pytorch/pytorch/issues/163798

It's problematic not to graph break because:
1. It breaks the current contract.
2. If dynamo traces, we end up with a .item() call; then if we ever re-trace later (in autograd, for example) we hit a failure (we do not know where to graph break at that point)! See the added unit test and the sketch below.
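
A minimal sketch of the behavior (default config, where `capture_scalar_outputs` is False):

```py
# Hedged sketch: with capture_scalar_outputs=False, dynamo graph breaks at .tolist()
# instead of tracing through the scalar extraction.
import torch

torch._dynamo.config.capture_scalar_outputs = False

@torch.compile(backend="eager")
def f(x):
    return x.tolist()  # graph break happens here

print(f(torch.arange(4)))  # [0, 1, 2, 3]
```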

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163807
Approved by: https://github.com/bobrenjc93
2025-09-28 04:02:52 +00:00
3059b08012 [inductor] add subsystem to pattern matcher (#163922)
Summary:
Running a toy example through `torch.compile(fullgraph=True, backend="inductor")` with default inductor config, I tried to see what passes are run in each of pre-grad, joint-graph, and post-grad phases by printing out the subsystem in `GraphTransformObserver`. However the subsystem showed up as None in a bunch of transforms that were run in each of those phases, so this PR adds some additional annotations.

Note that these annotations are probably not a complete set, since other transforms may run based on changes to the config that are not covered here.

Hopefully this doesn't change behavior. However, I did notice that bisecting relies on disabling various phases, which means that while before some passes would *not* be disabled (because their subsystem was `None`), now they would.

Test Plan: existing tests + manual test described in summary

Differential Revision: D83306676

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163922
Approved by: https://github.com/jansel
2025-09-28 03:15:23 +00:00
5504a06e01 [BE]: Update NCCL to 2.28.3 (#162351)
@eqy The new NCCL has a bunch of bugfixes and features, including reducing the number of SMs needed by NVLink collectives, as well as some very useful new APIs for SymmetricMemory. It also allows FP8 support for non-reductive operations on pre-sm90 devices.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162351
Approved by: https://github.com/ezyang, https://github.com/malfet, https://github.com/atalman
2025-09-28 01:38:59 +00:00
1ad491dd88 Better error handling in torch/csrc/jit/ir/* (#163757)
Refactor error handling to use TORCH_CHECK for improved clarity in constants and scope management

Fixes some parts of ISSUE #148114

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163757
Approved by: https://github.com/albanD
2025-09-28 01:18:24 +00:00
fd20889d0b Add type annotations to MPS profiler utilities (#163486)
## Summary
- drop the local mypy allow-untyped-defs escape hatch in the MPS profiler helpers
- annotate the context managers and bool helpers so they type-check cleanly

## Testing
- python -m mypy torch/mps/profiler.py --config-file mypy-strict.ini

------
https://chatgpt.com/codex/tasks/task_e_68d0ce4df2e483268d06673b65ef7745
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163486
Approved by: https://github.com/Skylion007
2025-09-27 23:00:53 +00:00
2ce2e48a05 [WIP][symm_mem] Add a wait for signal and put signal for one side API (#159837)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159837
Approved by: https://github.com/kwen2501
2025-09-27 21:20:13 +00:00
1d98be6abf [NFC] fixed typo in sparse semi structured filename (#163904)
Make sure all semi-structured files use "SparseSemiStructured"

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163904
Approved by: https://github.com/Skylion007
2025-09-27 21:19:48 +00:00
dfda239cce [DTensor] Raise an RuntimeError when checkpointing APIs are used with Partial placement (#163941)
A DTensor that contains partial placement shouldn't be checkpointed (DCP.save) -- the result is not correct and DCP doesn't know how to handle it.

There are several APIs that are only used by checkpointing, e.g.,`__create_write_items__`. These APIs should raise an exception if the DTensor, `self`, has Partial placement.

Ideally, we want to add the following test:

```
        with self.assertRaisesRegex(
            RuntimeError, "Any checkpointing related operations are not supported for"
        ):

            dcp.save({"dtensor": dtensor}, checkpoint_id=tempfile.gettempdir())
```

While we do see the RuntimeError is raised, the error was raised in another thread due to DTensor checkpoint APIs are called by DCP in a separate thread, which assertRaisesRegex cannot capture.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163941
Approved by: https://github.com/tianyu-l
2025-09-27 19:50:16 +00:00
991e3d0d16 [dynamo][guards] Revert introduction of different types of lambda_guards (#163385)
With
https://fb.workplace.com/groups/260102303573409/permalink/787294574187510/
issue, it might be a better idea to just speedup _realize_dict and keep
the changes very local. So reverting this PR as well, to return to clean
slate.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163385
Approved by: https://github.com/jansel
2025-09-27 18:20:48 +00:00
8f6dbc0ba8 [scan] create fw and bw graphs via partitioning (#162754)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162754
Approved by: https://github.com/zou3519
ghstack dependencies: #161557, #161664, #161808, #162025, #161732
2025-09-27 18:13:15 +00:00
3413490f53 [scan] materialize combine_fn in forward add more autograd tests (#161732)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161732
Approved by: https://github.com/zou3519
ghstack dependencies: #161557, #161664, #161808, #162025
2025-09-27 18:13:15 +00:00
b85bee3bbb [hop] refactor check input alias and mutation to be a graph pass (#162025)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162025
Approved by: https://github.com/zou3519
ghstack dependencies: #161557, #161664, #161808
2025-09-27 18:13:15 +00:00
66dbf2c9f5 [scan][autograd] clone outputs that's aliasing with inputs or outputs in bw (#161808)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161808
Approved by: https://github.com/zou3519
ghstack dependencies: #161557, #161664
2025-09-27 18:13:15 +00:00
f5d85874dd [scan][be] remove unnecessary tensor checks (#161664)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161664
Approved by: https://github.com/Skylion007, https://github.com/zou3519
ghstack dependencies: #161557
2025-09-27 18:13:14 +00:00
8f15d6a0c9 [test][scan] refactor inductor test and prepare for adding bw tests (#161557)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161557
Approved by: https://github.com/zou3519
2025-09-27 18:13:14 +00:00
e78792a70d Update ctc loss docs float32 input required for CuDNN (#162042)
Discovered while working on https://github.com/pytorch/pytorch/pull/159106: the non-obvious requirement that inputs must be float32 to use cuDNN (https://github.com/pytorch/pytorch/pull/159106#issuecomment-3189981705); otherwise the native CUDA implementation is called.

Updates the docs.
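
A hedged illustration of the requirement (shapes and values are made up; whether the cuDNN path is actually taken also depends on the other cuDNN CTC conditions):

```py
# Hedged sketch: float32 log_probs are a prerequisite for the cuDNN CTC path;
# other dtypes fall back to the native CUDA implementation.
import torch
import torch.nn.functional as F

T, N, C = 50, 16, 20  # input length, batch, classes (illustrative)
log_probs = torch.randn(T, N, C, device="cuda", dtype=torch.float32).log_softmax(2)
targets = torch.randint(1, C, (N, 30), dtype=torch.int32)
input_lengths = torch.full((N,), T, dtype=torch.int32)
target_lengths = torch.randint(10, 30, (N,), dtype=torch.int32)
loss = F.ctc_loss(log_probs, targets, input_lengths, target_lengths)
```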

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162042
Approved by: https://github.com/mikaylagawarecki

Co-authored-by: mikaylagawarecki <mikaylagawarecki@gmail.com>
2025-09-27 18:10:17 +00:00
d9db838f58 [CI] Re-enable test_all_to_all_vdev_2d_offset (#163985)
Fixes https://github.com/pytorch/pytorch/issues/163847
Moving allocations upfront and collectives later. The hang goes away.

My investigation indicates that the hang is inside the last call `torch.testing.assert_close(out_expected, out[:out_numel])`. Rank 3 calls into it, but never gets out. Don't know why yet. I will investigate more.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163985
Approved by: https://github.com/fegin
2025-09-27 16:56:25 +00:00
6ba83e06a5 [AMP] Add deprecated decorator for torch.xxx.amp.autocast class (#163654)
As the title stated.

**Changes:**
- torch.cuda.amp.autocast
- torch.cpu.amp.autocast
- add explicit `__new__` and `__init_subclass__` for the classes above so that inspect.signature can retrieve the correct signature
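
A hedged sketch of what the deprecation means at call sites (illustrative only):

```py
# The device-specific autocast aliases now warn on use;
# torch.amp.autocast with an explicit device type is the replacement.
import torch

with torch.cuda.amp.autocast():    # deprecated alias, now emits a warning
    pass

with torch.amp.autocast("cuda"):   # preferred spelling
    pass
```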

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163654
Approved by: https://github.com/Skylion007
2025-09-27 14:37:12 +00:00
960290d629 [Docs] Add standard-imghdr for PyTorch Doc (#163944)
As the title stated.

Python [PEP 594](https://peps.python.org/pep-0594) removed imghdr from the Python standard library, and the older version of Sphinx doesn't declare it as an installation dependency, so we need to add it to the requirements as a temporary dependency.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163944
Approved by: https://github.com/albanD, https://github.com/svekars
2025-09-27 08:14:51 +00:00
b1a4efc302 [amd] Add cudaHostFn_t to cuda_to_hip_mappings (#164007)
Summary: See title

Test Plan:
```
buck build --flagfile fbcode//mode/opt-amd-gpu fbcode//comms/ctran/algos/common/tests:ctran_algo_gpe_kernel_sync_test
```
After fix: https://www.internalfb.com/buck2/362ff91e-53f2-4b82-9536-cb84c91384a2

Before fix: failed in D83294731 (version 1):
https://www.internalfb.com/sandcastle/workflow/1792432651703947243

Differential Revision: D83375414

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164007
Approved by: https://github.com/llxxee
2025-09-27 06:09:50 +00:00
96182faf96 [CI][Distributed][CUDA][Symm-Mem] Enable B200 Symm Mem Test (#162988)
Inspired by https://github.com/pytorch/pytorch/pull/162981 and motivated by https://github.com/pytorch/pytorch/pull/159323 taking a total of 20 hours to finish (and unlikely to make it in short time due to https://github.com/pytorch/pytorch/issues/162178 )

Creating this subtest to get *something distributed* on B200.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162988
Approved by: https://github.com/malfet
2025-09-27 05:12:05 +00:00
dcb8af7501 [torchfuzz] fix bool propagation (#164003)
Bools can't propagate through the current pointwise ops such as add/mul. Once we add more that can, we'll probably want to add an additional subclass that supports pointwise bools, but for now just don't allow it.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164003
Approved by: https://github.com/pianpwk
ghstack dependencies: #163743, #163812, #163890, #164002
2025-09-27 04:51:29 +00:00
280e712c13 [vllm hash update] update the pinned vllm hash (#164029)
This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml).
Update the pinned vllm hash.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164029
Approved by: https://github.com/pytorchbot
2025-09-27 04:34:57 +00:00
254d2864d6 Add runtime_overhead PR Time Benchmark (#163866)
This adds a PR time benchmark that checks for runtime overhead on a very small graph. This will help track regressions in runtime overhead.

Example Results:
```
runtime_overhead_inductor,instruction_count,222645
runtime_overhead_inductor_inference_mode,instruction_count,234998
runtime_overhead_inductor_requires_grad,instruction_count,293556
runtime_overhead_inductor_requires_grad_backward,instruction_count,78181
runtime_overhead_inductor_dynamic,instruction_count,234870
runtime_overhead_inductor_inference_mode_dynamic,instruction_count,248711
runtime_overhead_inductor_requires_grad_dynamic,instruction_count,309979
runtime_overhead_inductor_requires_grad_backward_dynamic,instruction_count,77599
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163866
Approved by: https://github.com/jansel, https://github.com/mlazos, https://github.com/anijain2305
2025-09-27 03:26:59 +00:00
9dac6437da lint: Filter out /usr/include from results (#164012)
Signed-off-by: Eli Uriegas <eliuriegas@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164012
Approved by: https://github.com/ZainRizvi
ghstack dependencies: #164008
2025-09-27 00:54:07 +00:00
8a0e8cad5f lint: Only include files in pytorch (#164008)
We were seeing instances of stdlib files in clang-tidy output so this
just essentially removes them from the things that lintrunner will
report up. A longer-term fix here would be to just modify the clang-tidy
configuration in order to do the correct thing here, but that requires a
bit more investigation as to why this is only happening in CI and is not
reproducible locally.

Signed-off-by: Eli Uriegas <eliuriegas@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164008
Approved by: https://github.com/ZainRizvi
2025-09-27 00:54:07 +00:00
3a115da3e6 [torchfuzz] ones over zero (#164002)
Reduces the likelihood of divide-by-zero errors. Long term, we'll probably want to just fuzz these values entirely.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164002
Approved by: https://github.com/pianpwk
ghstack dependencies: #163743, #163812, #163890
2025-09-27 00:53:02 +00:00
b48a3d0a38 [CuTe] Add layout overlap checking util function in _MeshLayout (#163367)
While refactoring the bookkeeping for DeviceMesh to leverage CuTe layout, we found that we need two more util functions. One is to check whether a layout has overlap inside it or not. For example, (2,2):(2,1) has no overlap while (2,2):(2,2) has overlap.
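
A hedged, plain-Python sketch of the overlap idea (not the actual `_MeshLayout` API; `has_overlap` below is a made-up helper): enumerate the flat offsets of a (shape, stride) layout and check for duplicates.

```py
# Illustrative only: brute-force overlap check for a small (shape, stride) layout.
from itertools import product

def has_overlap(shape, stride):
    offsets = [sum(i * s for i, s in zip(idx, stride))
               for idx in product(*(range(d) for d in shape))]
    return len(offsets) != len(set(offsets))

print(has_overlap((2, 2), (2, 1)))  # False: offsets 0, 1, 2, 3 are all distinct
print(has_overlap((2, 2), (2, 2)))  # True:  offsets 0, 2, 2, 4 collide
```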

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163367
Approved by: https://github.com/fegin
ghstack dependencies: #163212, #163288, #163928, #163930
2025-09-27 00:22:14 +00:00
8d474bdc14 Change python grid calc for MTIA back to python mode (#163601)
Differential Revision: D83000165

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163601
Approved by: https://github.com/blaine-rister
2025-09-27 00:12:53 +00:00
008051b13c [Dynamic Shape][BE] trim _DimHint serialization (#163891)
Summary:
current serialization is a bit hard to read
```
Exporting with the dynamic shape spec: {getitem_123: (_DimHint(type=<_DimHintType.DYNAMIC: 3>, min=1, max=64, _factory=False)), getitem_118: (_DimHint(type=<_DimHintType.DYNAMIC: 3>,
min=489, max=31232, _factory=False)), getitem_117: (_DimHint(type=<_DimHintType.DYNAMIC: 3>, min=489, max=31232, _factory=False)), getitem_116: (_DimHint(type=<_DimHintType.DYNAMIC: 3>, min=489, max=31232, _factory=False)), getitem_115: (
_DimHint(type=<_DimHintType.STATIC: 2>, min=None, max=None, _factory=True), _DimHint(type=<_DimHintType.DYNAMIC: 3>, min=1, max=64, _factory=False)), getitem_46: (_DimHint(type=<_DimHintType.DYNAMIC: 3>, min=29, max=1792, _factory=False),
 _DimHint(type=<_DimHintType.STATIC: 2>, min=None, max=None, _factory=True)), _predict_module__base_model_model_ro_sparse_arch_ebc__output_dists_0__dist: (_DimHint(type=<_DimHintType.DYNAMIC: 3>, min=1, max=64, _factory=False), _DimHint(t
ype=<_DimHintType.STATIC: 2>, min=None, max=None, _factory=True)), _predict_module__base_model_model_nro_sparse_arch_ebc__output_dists_0__dist: (_DimHint(type=<_DimHintType.DYNAMIC: 3>, min=29, max=1792, _factory=False)...
```

Test Plan: UT

Differential Revision: D83175131

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163891
Approved by: https://github.com/pianpwk
2025-09-27 00:08:01 +00:00
e4ffd718ec Fix setting of memory fraction in test_garbage_collect_expandable (#164000)
Fixes #160598
Fixes #160551
Fixes #160507

This PR fixes a bug in the `test_garbage_collect_expandable` unit test where the finally block incorrectly re-reads the current per-process memory fraction instead of setting the original value. Without the fix, the other tests in the `test/test_cuda.py` test suite were impacted and failed with an OOM error on ROCm.

This ensures proper cleanup and isolation of test state, maintaining test correctness and avoiding side effects like the below OOM error that it caused.

For example, `test_autocast_checkpointing`  failed with the below error https://github.com/pytorch/pytorch/actions/runs/17982223758/job/51153974194 on ROCm

`torch.OutOfMemoryError: HIP out of memory. Tried to allocate 76.00 MiB. GPU 0 has a total capacity of 255.69 GiB of which 252.97 GiB is free. 1.20 GiB allowed; Of the allocated memory 1.14 GiB is allocated by PyTorch, with 17.00 MiB allocated in private pools (e.g., HIP Graphs), and 18.63 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)`
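
A hedged sketch of the save/restore pattern the fix establishes (illustrative, not the literal test body; it assumes a getter such as `torch.cuda.get_per_process_memory_fraction` is available):

```py
# Illustrative save/restore of the per-process memory fraction around a test body.
import torch

orig = torch.cuda.get_per_process_memory_fraction(0)  # assumed getter
try:
    torch.cuda.set_per_process_memory_fraction(0.005, 0)
    # ... exercise expandable-segment garbage collection here ...
finally:
    # restore the saved value rather than re-reading the (already modified) current one
    torch.cuda.set_per_process_memory_fraction(orig, 0)
```
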
Pull Request resolved: https://github.com/pytorch/pytorch/pull/164000
Approved by: https://github.com/jeffdaily
2025-09-26 23:57:32 +00:00
ed3085814a [cuDNN][SDPA] Disable dropout for cuDNN SDPA on 9.11 - 9.13 (#163903)
cuDNN introduced some broken heuristics for these cases so we need to disable dropout to avoid unexpected crashes due to heuristics refusing to proceed

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163903
Approved by: https://github.com/ngimel, https://github.com/malfet, https://github.com/atalman
2025-09-26 23:50:09 +00:00
e2817ac204 [cuDNN][Convolution] Disable cuDNN for 3D convolutions with kernel size != 1 for cuDNN 9.8+ (#163581)
To workaround #163539

Still confirming whether 9.10 is affected. The original test states that the convolution is "large," but note that the input size does not appear to require 64-bit indexing.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163581
Approved by: https://github.com/ngimel, https://github.com/malfet

Co-authored-by: Nikita Shulga <2453524+malfet@users.noreply.github.com>
2025-09-26 23:47:29 +00:00
1d138e658d [AOTI] log error triton kernel name during autotune (#163889)
Summary: can't tell from the current error message which kernel got the exception

Test Plan: lint & pyre

Reviewed By: muchulee8

Differential Revision: D83246522

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163889
Approved by: https://github.com/jansel
2025-09-26 23:29:49 +00:00
f9095fb285 [Windows] Update libuv version from 1.39 to 1.51 (#160318)
Fixes: [#148315](https://github.com/pytorch/pytorch/issues/148315)

The PR updates `libuv` version as `conda-forge` channel doesn't contain `libuv=1.39` for Windows.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160318
Approved by: https://github.com/iremyux, https://github.com/malfet
2025-09-26 23:29:21 +00:00
a0136f149c [MPS] Fix nan behavior in grid_sampler_3d (#163881)
Fixes #163851
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163881
Approved by: https://github.com/malfet
2025-09-26 23:08:00 +00:00
62b0ebd8f9 [MPS] [Sparse] unique_dim and sparse broadcast (#163694)
Implements unique_dim and sparse broadcast ops, and adds MPS dtypes for tests where we expect to fail; otherwise they would always fail due to being run in double precision

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163694
Approved by: https://github.com/malfet
2025-09-26 23:03:13 +00:00
19f16a65b4 [torchfuzz] Add support for fuzz templates (#163890)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163890
Approved by: https://github.com/pianpwk
ghstack dependencies: #163743, #163812
2025-09-26 22:51:45 +00:00
0ebfa3d7d2 Avoid fast path mask left-align check in compiled TransformerEncoder (#163773)
Fixes #163640

This PR avoids a mask left align check in the case that we're operating under torch.compile / torch.export. Originally, I planned to make a more invasive change to auto-disable the fast path entirely underneath torch.compile / torch.export, but I realized during testing that the fast path wasn't actually causing compile issues outside of the narrow issue identified here.
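
A hedged sketch of the scenario the fix targets (shapes and mask values are made up): a compiled TransformerEncoder with a key-padding mask that is not left-aligned.

```py
# Illustrative only: small encoder in eval/no_grad (fast-path eligible) compiled
# with a padding mask whose padded positions are not left-aligned.
import torch
import torch.nn as nn

layer = nn.TransformerEncoderLayer(d_model=16, nhead=2, batch_first=True)
enc = nn.TransformerEncoder(layer, num_layers=1).eval()

src = torch.randn(2, 5, 16)
mask = torch.tensor([[False, True, False, True, False],
                     [False, False, False, True, True]])

with torch.no_grad():
    out = torch.compile(enc)(src, src_key_padding_mask=mask)
print(out.shape)  # torch.Size([2, 5, 16])
```
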
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163773
Approved by: https://github.com/mikaylagawarecki
2025-09-26 22:29:37 +00:00
eqy
0ea10f9912 [cuDNN][conv][64-bit] Disable cuDNN for 64-bit depthwise convs again (#163171)
test is breaking, will check if there's an older version that we can enable on to avoid completely dropping support

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163171
Approved by: https://github.com/ngimel, https://github.com/malfet
2025-09-26 22:12:17 +00:00
48a852b7ae [AOTI] Update AOTInductor tutorial (#163808)
Summary: Remove the BC breaking warning. Add inductor_config to the example code.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163808
Approved by: https://github.com/yushangdi
2025-09-26 22:01:31 +00:00
f1260c9b9a [ROCm][CI/CD] upgrade nightly wheels to ROCm 7.0 (#163937)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163937
Approved by: https://github.com/jeffdaily

Co-authored-by: Jeff Daily <jeff.daily@amd.com>
2025-09-26 21:42:09 +00:00
28c7d11428 [AOTI] Pass in shape_env for get_stride_order (#163925)
Summary:
As titled.
Without the diff, we got P1963055009

With the diff passing in the environment, we can do correct sym_int deduction:
https://fburl.com/mlhub/p5zy7o28

Test Plan:
```
buck2 test 'fbcode//mode/opt' fbcode//caffe2/test/inductor:unbacked_symints -- test_sdfpa_unbacked_strides --print-passing-details --env TORCHDYNAMO_EXTENDED_DEBUG_CPP=1 --env TORCHDYNAMO_EXTENDED_DEBUG_GUARD_ADDED="Eq(u0, 0)"
```
Without the fix: P1964887260
With the fix: P1964888579

Differential Revision: D83211018

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163925
Approved by: https://github.com/ColinPeppler
2025-09-26 21:10:03 +00:00
a60c6ed99f [DeviceMesh][ez] Extract the pg creation as a util function (#163930)
This is just to extract common logic into a util function because we will use it many times for the following stack of Device Mesh refactoring.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163930
Approved by: https://github.com/fegin
ghstack dependencies: #163212, #163288, #163928
2025-09-26 20:42:58 +00:00
c257570e6c [inductor] require shape in TritonCSEVariable (#162275)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162275
Approved by: https://github.com/mlazos
2025-09-26 20:41:12 +00:00
2f85de0b42 Fix preserve annotation with decomp (#163896)
If we use `fx_traceback.preserve_node_meta()`, we will have a few extra node.meta fields on nodes, such as "seq_nr", added from `fx/proxy.py`. As a result, there might be non-empty node.meta on graph nodes.
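
A hedged sketch of how such extra metadata can show up (illustrative; the exact fields may vary by tracer and version):

```py
# Tracing under preserve_node_meta() can leave extra fields (e.g. "seq_nr")
# in node.meta; this just prints whatever metadata ends up on the nodes.
import torch
import torch.fx as fx
import torch.fx.traceback as fx_traceback

def f(x):
    return x.sin() + 1

with fx_traceback.preserve_node_meta():
    gm = fx.symbolic_trace(f)

for node in gm.graph.nodes:
    print(node.name, node.meta)
```
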
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163896
Approved by: https://github.com/SherlockNoMad, https://github.com/ydwu4
2025-09-26 20:28:47 +00:00
345 changed files with 5969 additions and 3327 deletions

View File

@ -1 +1 @@
v2.27.5-1
v2.28.3-1

View File

@ -1 +1 @@
v2.27.7-1
v2.28.3-1

View File

@ -12,8 +12,8 @@ function do_install() {
rocm_version_nodot=${rocm_version//./}
# Version 2.7.2 + ROCm related updates
MAGMA_VERSION=a1625ff4d9bc362906bd01f805dbbe12612953f6
# https://github.com/icl-utk-edu/magma/pull/65
MAGMA_VERSION=d6e4117bc88e73f06d26c6c2e14f064e8fc3d1ec
magma_archive="magma-rocm${rocm_version_nodot}-${MAGMA_VERSION}-1.tar.bz2"
rocm_dir="/opt/rocm"

View File

@ -1,8 +1,15 @@
sphinx==5.3.0
#Description: This is used to generate PyTorch docs
#Pinned versions: 5.3.0
-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@d53b0ffb9b1cda68260693ea98f3483823c88d8e#egg=pytorch_sphinx_theme2
standard-imghdr==3.13.0; python_version >= "3.13"
#Description: This is needed by Sphinx, so it needs to be added here.
# The reasons are as follows:
# 1) This module has been removed from the Python standard library since Python 3.13(https://peps.python.org/pep-0594/#imghdr);
# 2) The current version of Sphinx (5.3.0) is not compatible with Python 3.13.
# Once Sphinx is upgraded to a version compatible with Python 3.13 or later, we can remove this dependency.
-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@d53b0ffb9b1cda68260693ea98f3483823c88d8e#egg=pytorch_sphinx_theme2
# TODO: sphinxcontrib.katex 0.9.0 adds a local KaTeX server to speed up pre-rendering
# but it doesn't seem to work and hangs around idly. The initial thought that it is probably
# something related to Docker setup. We can investigate this later.

View File

@ -104,7 +104,7 @@ if [[ "$DESIRED_CUDA" == *"rocm"* ]]; then
export ROCclr_DIR=/opt/rocm/rocclr/lib/cmake/rocclr
fi
echo "Calling 'python -m pip install .' at $(date)"
echo "Calling -m pip install . -v --no-build-isolation at $(date)"
if [[ $LIBTORCH_VARIANT = *"static"* ]]; then
STATIC_CMAKE_FLAG="-DTORCH_STATIC=1"

View File

@ -1617,7 +1617,7 @@ test_operator_benchmark() {
test_inductor_set_cpu_affinity
cd benchmarks/operator_benchmark/pt_extension
python -m pip install .
python -m pip install . -v --no-build-isolation
cd "${TEST_DIR}"/benchmarks/operator_benchmark
$TASKSET python -m benchmark_all_test --device "$1" --tag-filter "$2" \
@ -1630,6 +1630,25 @@ test_operator_benchmark() {
--expected "expected_ci_operator_benchmark_eager_float32_cpu.csv"
}
test_operator_microbenchmark() {
TEST_REPORTS_DIR=$(pwd)/test/test-reports
mkdir -p "$TEST_REPORTS_DIR"
TEST_DIR=$(pwd)
cd benchmarks/operator_benchmark/pt_extension
python -m pip install .
cd "${TEST_DIR}"/benchmarks/operator_benchmark
for OP_BENCHMARK_TESTS in matmul mm addmm bmm; do
$TASKSET python -m pt.${OP_BENCHMARK_TESTS}_test --tag-filter long \
--output-json-for-dashboard "${TEST_REPORTS_DIR}/operator_microbenchmark_${OP_BENCHMARK_TESTS}_compile.json" \
--benchmark-name "PyTorch operator microbenchmark" --use-compile
$TASKSET python -m pt.${OP_BENCHMARK_TESTS}_test --tag-filter long \
--output-json-for-dashboard "${TEST_REPORTS_DIR}/operator_microbenchmark_${OP_BENCHMARK_TESTS}.json" \
--benchmark-name "PyTorch operator microbenchmark"
done
}
if ! [[ "${BUILD_ENVIRONMENT}" == *libtorch* || "${BUILD_ENVIRONMENT}" == *-bazel-* ]]; then
(cd test && python -c "import torch; print(torch.__config__.show())")
@ -1686,6 +1705,8 @@ elif [[ "${TEST_CONFIG}" == *operator_benchmark* ]]; then
test_operator_benchmark cpu ${TEST_MODE}
fi
elif [[ "${TEST_CONFIG}" == *operator_microbenchmark* ]]; then
test_operator_microbenchmark
elif [[ "${TEST_CONFIG}" == *inductor_distributed* ]]; then
test_inductor_distributed
elif [[ "${TEST_CONFIG}" == *inductor-halide* ]]; then
@ -1794,6 +1815,8 @@ elif [[ "${TEST_CONFIG}" == h100_distributed ]]; then
test_h100_distributed
elif [[ "${TEST_CONFIG}" == "h100-symm-mem" ]]; then
test_h100_symm_mem
elif [[ "${TEST_CONFIG}" == "b200-symm-mem" ]]; then
test_h100_symm_mem
elif [[ "${TEST_CONFIG}" == h100_cutlass_backend ]]; then
test_h100_cutlass_backend
else

View File

@ -63,7 +63,7 @@ if errorlevel 1 exit /b 1
call %CONDA_HOME%\condabin\activate.bat testenv
if errorlevel 1 exit /b 1
call conda install -y -q -c conda-forge libuv=1.39
call conda install -y -q -c conda-forge libuv=1.51
call conda install -y -q intel-openmp
echo "install and test libtorch"

View File

@ -1,47 +0,0 @@
#!/bin/bash
# =================== The following code **should** be executed inside Docker container ===================
# Install dependencies
sudo apt-get -y update
sudo apt-get -y install expect-dev
# This is where the local pytorch install in the docker image is located
pt_checkout="/var/lib/jenkins/workspace"
source "$pt_checkout/.ci/pytorch/common_utils.sh"
echo "functorch_doc_push_script.sh: Invoked with $*"
set -ex
version=${DOCS_VERSION:-nightly}
echo "version: $version"
# Build functorch docs
pushd $pt_checkout/functorch/docs
pip -q install -r requirements.txt
make html
popd
git clone https://github.com/pytorch/functorch -b gh-pages --depth 1 functorch_ghpages
pushd functorch_ghpages
if [ $version == "main" ]; then
version=nightly
fi
git rm -rf "$version" || true
mv "$pt_checkout/functorch/docs/build/html" "$version"
git add "$version" || true
git status
git config user.email "soumith+bot@pytorch.org"
git config user.name "pytorchbot"
# If there aren't changes, don't make a commit; push is no-op
git commit -m "Generate Python docs from pytorch/pytorch@${GITHUB_SHA}" || true
git status
if [[ "${WITH_PUSH:-}" == true ]]; then
git push -u origin gh-pages
fi
popd
# =================== The above code **should** be executed inside Docker container ===================

View File

@ -69,6 +69,8 @@ readability-string-compare,
'
HeaderFilterRegex: '^(aten/|c10/|torch/).*$'
WarningsAsErrors: '*'
LineFilter:
- name: '/usr/include/.*'
CheckOptions:
cppcoreguidelines-special-member-functions.AllowSoleDefaultDtor: true
cppcoreguidelines-special-member-functions.AllowImplicitlyDeletedCopyOrMove: true

View File

@ -1,6 +1,10 @@
---
name: "⚠️ CI SEV"
about: Tracking incidents for PyTorch's CI infra.
title: ''
labels: ''
assignees: ''
---
> NOTE: Remember to label this issue with "`ci: sev`"

View File

@ -0,0 +1,18 @@
---
name: DISABLE AUTOREVERT
about: Disables autorevert when open
title: "❌​\U0001F519 [DISABLE AUTOREVERT]"
labels: 'ci: disable-autorevert'
assignees: ''
---
This issue, while open, disables the autorevert functionality.
More details can be found [here](https://github.com/pytorch/test-infra/blob/main/aws/lambda/pytorch-auto-revert/README.md)
## Why are you disabling autorevert?
## Links to any issues/commits/errors that shows the source of problem

View File

@ -1,8 +1,10 @@
---
name: Disable CI jobs (PyTorch Dev Infra only)
about: Use this template to disable CI jobs
title: "DISABLED [WORKFLOW_NAME] / [PLATFORM_NAME] / [JOB_NAME]"
labels: "module: ci"
title: DISABLED [WORKFLOW_NAME] / [PLATFORM_NAME] / [JOB_NAME]
labels: 'module: ci'
assignees: ''
---
> For example, DISABLED pull / win-vs2022-cpu-py3 / test (default). Once

View File

@ -1 +1 @@
9fe4c2bdb9859c14ad7f7479e1db7e01083bada3
0307428d65acf5cf1a73a70a7722e076bbb83f22

View File

@ -1 +1 @@
c77852e117bdf056c8e9a087e51d6f65cf6ba53d
0fc62aa26a30ed7ca419d285f285cb5ba02c4394

View File

@ -1,43 +1,44 @@
tracking_issue: 24422
ciflow_tracking_issue: 64124
ciflow_push_tags:
- ciflow/b200
- ciflow/b200-symm-mem
- ciflow/binaries
- ciflow/binaries_libtorch
- ciflow/binaries_wheel
- ciflow/triton_binaries
- ciflow/h100
- ciflow/h100-cutlass-backend
- ciflow/h100-distributed
- ciflow/h100-symm-mem
- ciflow/inductor
- ciflow/inductor-periodic
- ciflow/inductor-rocm
- ciflow/inductor-perf-test-nightly-rocm
- ciflow/inductor-perf-compare
- ciflow/inductor-cu126
- ciflow/inductor-micro-benchmark
- ciflow/inductor-micro-benchmark-cpu-x86
- ciflow/inductor-perf-compare
- ciflow/inductor-perf-test-nightly-rocm
- ciflow/inductor-perf-test-nightly-x86-zen
- ciflow/inductor-cu126
- ciflow/inductor-periodic
- ciflow/inductor-rocm
- ciflow/linux-aarch64
- ciflow/mps
- ciflow/nightly
- ciflow/op-benchmark
- ciflow/periodic
- ciflow/periodic-rocm-mi300
- ciflow/pull
- ciflow/quantization-periodic
- ciflow/riscv64
- ciflow/rocm
- ciflow/rocm-mi300
- ciflow/s390
- ciflow/riscv64
- ciflow/slow
- ciflow/torchbench
- ciflow/triton_binaries
- ciflow/trunk
- ciflow/unstable
- ciflow/xpu
- ciflow/vllm
- ciflow/torchbench
- ciflow/op-benchmark
- ciflow/pull
- ciflow/h100
- ciflow/h100-distributed
- ciflow/win-arm64
- ciflow/h100-symm-mem
- ciflow/h100-cutlass-backend
- ciflow/b200
- ciflow/xpu
retryable_workflows:
- pull
- trunk
@ -46,4 +47,4 @@ retryable_workflows:
- inductor-A100-perf-nightly
labeler_config: labeler.yml
label_to_label_config: label_to_label.yml
mergebot: True
mergebot: true

View File

@ -30,7 +30,7 @@ CUDA_ARCHES_CUDNN_VERSION = {
}
# NOTE: Please also update the ROCm sources in `PIP_SOURCES` in tools/nightly.py when changing this
ROCM_ARCHES = ["6.3", "6.4"]
ROCM_ARCHES = ["6.4", "7.0"]
XPU_ARCHES = ["xpu"]
@ -53,7 +53,7 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
"nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | "
"nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | "
"nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | "
"nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | "
"nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | "
"nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | "
"nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | "
"nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | "
@ -70,7 +70,7 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
"nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | "
"nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | "
"nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | "
"nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | "
"nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | "
"nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | "
"nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | "
"nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | "
@ -87,7 +87,7 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
"nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | "
"nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | "
"nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | "
"nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | "
"nvidia-nccl-cu13==2.28.3; platform_system == 'Linux' | "
"nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | "
"nvidia-nvtx==13.0.39; platform_system == 'Linux' | "
"nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | "

View File

@ -273,6 +273,8 @@ jobs:
TEST_CONFIG: ${{ matrix.config }}
SHARD_NUMBER: ${{ matrix.shard }}
NUM_TEST_SHARDS: ${{ matrix.num_shards }}
EXTRA_FLAGS: ${{ matrix.extra_flags || '' }}
OP_BENCHMARK_TESTS: ${{ matrix.op_benchmark_tests }}
REENABLED_ISSUES: ${{ steps.keep-going.outputs.reenabled-issues }}
CONTINUE_THROUGH_ERROR: ${{ steps.keep-going.outputs.keep-going }}
VERBOSE_TEST_LOGS: ${{ steps.keep-going.outputs.ci-verbose-test-logs }}

60
.github/workflows/b200-symm-mem.yml vendored Normal file
View File

@ -0,0 +1,60 @@
name: Limited CI for symmetric memory tests on B200
on:
pull_request:
paths:
- .github/workflows/b200-symm-mem.yml
workflow_dispatch:
push:
tags:
- ciflow/b200-symm-mem/*
schedule:
- cron: 22 8 * * * # about 1:22am PDT
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
cancel-in-progress: true
permissions:
id-token: write
contents: read
jobs:
get-label-type:
if: github.repository_owner == 'pytorch'
name: get-label-type
uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
linux-jammy-cuda12_8-py3_10-gcc11-sm100-build-symm:
name: linux-jammy-cuda12.8-py3.10-gcc11-sm100-symm
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runner: linux.12xlarge.memory
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100-symm
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
cuda-arch-list: '10.0'
test-matrix: |
{ include: [
{ config: "b200-symm-mem", shard: 1, num_shards: 1, runner: "linux.dgx.b200.8" },
]}
secrets: inherit
linux-jammy-cuda12_8-py3_10-gcc11-sm100-test:
name: linux-jammy-cuda12.8-py3.10-gcc11-sm100-symm
uses: ./.github/workflows/_linux-test.yml
needs:
- linux-jammy-cuda12_8-py3_10-gcc11-sm100-build-symm
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100-symm
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm100-build-symm.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm100-build-symm.outputs.test-matrix }}
aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
secrets: inherit

View File

@ -52,7 +52,6 @@ jobs:
{ tag: "cuda12.9" },
{ tag: "cuda12.8" },
{ tag: "cuda12.6" },
{ tag: "rocm6.3" },
{ tag: "rocm6.4" },
{ tag: "rocm7.0" },
{ tag: "cpu" },

View File

@ -34,7 +34,7 @@ jobs:
id-token: write
strategy:
matrix:
rocm_version: ["70", "64", "63"]
rocm_version: ["70", "64"]
steps:
- name: Checkout PyTorch
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

View File

@ -52,7 +52,6 @@ jobs:
{ name: "manylinuxaarch64-builder", tag: "cuda13.0", runner: "linux.arm64.2xlarge.ephemeral" },
{ name: "manylinuxaarch64-builder", tag: "cuda12.8", runner: "linux.arm64.2xlarge.ephemeral" },
{ name: "manylinuxaarch64-builder", tag: "cuda12.6", runner: "linux.arm64.2xlarge.ephemeral" },
{ name: "manylinux2_28-builder", tag: "rocm6.3", runner: "linux.9xlarge.ephemeral" },
{ name: "manylinux2_28-builder", tag: "rocm6.4", runner: "linux.9xlarge.ephemeral" },
{ name: "manylinux2_28-builder", tag: "rocm7.0", runner: "linux.9xlarge.ephemeral" },
{ name: "manylinux2_28-builder", tag: "cpu", runner: "linux.9xlarge.ephemeral" },

View File

@ -55,7 +55,7 @@ jobs:
docker-image: ["pytorch/manylinux2_28-builder:cpu"]
include:
- device: "rocm"
rocm_version: "6.4"
rocm_version: "7.0"
runs_on: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge"
- device: "cuda"
rocm_version: ""

View File

@ -132,7 +132,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_10-cuda-aarch64-12_6
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -178,7 +178,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_10-cuda-aarch64-12_8
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -224,7 +224,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_10-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -335,7 +335,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_11-cuda-aarch64-12_6
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -381,7 +381,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_11-cuda-aarch64-12_8
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -427,7 +427,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_11-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -538,7 +538,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_12-cuda-aarch64-12_6
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -584,7 +584,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_12-cuda-aarch64-12_8
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -630,7 +630,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_12-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -741,7 +741,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13-cuda-aarch64-12_6
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -787,7 +787,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13-cuda-aarch64-12_8
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -833,7 +833,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -944,7 +944,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13t-cuda-aarch64-12_6
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -990,7 +990,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13t-cuda-aarch64-12_8
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1036,7 +1036,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13t-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1147,7 +1147,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14-cuda-aarch64-12_6
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1193,7 +1193,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14-cuda-aarch64-12_8
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1239,7 +1239,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1350,7 +1350,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14t-cuda-aarch64-12_6
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1396,7 +1396,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14t-cuda-aarch64-12_8
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1442,7 +1442,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14t-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}

View File

@ -316,121 +316,6 @@ jobs:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
libtorch-rocm6_3-shared-with-deps-release-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: rocm6.3
GPU_ARCH_VERSION: "6.3"
GPU_ARCH_TYPE: rocm
DOCKER_IMAGE: libtorch-cxx11-builder
DOCKER_IMAGE_TAG_PREFIX: rocm6.3
LIBTORCH_CONFIG: release
LIBTORCH_VARIANT: shared-with-deps
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
timeout-minutes: 300
build_name: libtorch-rocm6_3-shared-with-deps-release
build_environment: linux-binary-libtorch
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
libtorch-rocm6_3-shared-with-deps-release-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- libtorch-rocm6_3-shared-with-deps-release-build
- get-label-type
runs-on: linux.rocm.gpu.mi250
timeout-minutes: 240
env:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: rocm6.3
GPU_ARCH_VERSION: "6.3"
GPU_ARCH_TYPE: rocm
SKIP_ALL_TESTS: 1
DOCKER_IMAGE: libtorch-cxx11-builder
DOCKER_IMAGE_TAG_PREFIX: rocm6.3
LIBTORCH_CONFIG: release
LIBTORCH_VARIANT: shared-with-deps
steps:
- name: Setup ROCm
uses: ./.github/actions/setup-rocm
- uses: actions/download-artifact@v4.1.7
name: Download Build Artifacts
with:
name: libtorch-rocm6_3-shared-with-deps-release
path: "${{ runner.temp }}/artifacts/"
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
show-progress: false
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: ROCm set GPU_FLAG
run: |
echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}"
- name: configure aws credentials
id: aws_creds
if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') }}
uses: aws-actions/configure-aws-credentials@v4
with:
role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
aws-region: us-east-1
role-duration-seconds: 18000
- name: Calculate docker image
id: calculate-docker-image
uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
with:
docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }}
docker-image-name: libtorch-cxx11-builder
custom-tag-prefix: rocm6.3
docker-build-dir: .ci/docker
working-directory: pytorch
- name: Pull Docker image
uses: pytorch/test-infra/.github/actions/pull-docker-image@main
with:
docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
- name: Test Pytorch binary
uses: ./pytorch/.github/actions/test-pytorch-binary
env:
DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
- name: Teardown ROCm
uses: ./.github/actions/teardown-rocm
libtorch-rocm6_3-shared-with-deps-release-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: libtorch-rocm6_3-shared-with-deps-release-test
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: rocm6.3
GPU_ARCH_VERSION: "6.3"
GPU_ARCH_TYPE: rocm
DOCKER_IMAGE: libtorch-cxx11-builder
DOCKER_IMAGE_TAG_PREFIX: rocm6.3
LIBTORCH_CONFIG: release
LIBTORCH_VARIANT: shared-with-deps
build_name: libtorch-rocm6_3-shared-with-deps-release
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
libtorch-rocm6_4-shared-with-deps-release-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@ -545,3 +430,118 @@ jobs:
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
libtorch-rocm7_0-shared-with-deps-release-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: rocm7.0
GPU_ARCH_VERSION: "7.0"
GPU_ARCH_TYPE: rocm
DOCKER_IMAGE: libtorch-cxx11-builder
DOCKER_IMAGE_TAG_PREFIX: rocm7.0
LIBTORCH_CONFIG: release
LIBTORCH_VARIANT: shared-with-deps
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
timeout-minutes: 300
build_name: libtorch-rocm7_0-shared-with-deps-release
build_environment: linux-binary-libtorch
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
libtorch-rocm7_0-shared-with-deps-release-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- libtorch-rocm7_0-shared-with-deps-release-build
- get-label-type
runs-on: linux.rocm.gpu.mi250
timeout-minutes: 240
env:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: rocm7.0
GPU_ARCH_VERSION: "7.0"
GPU_ARCH_TYPE: rocm
SKIP_ALL_TESTS: 1
DOCKER_IMAGE: libtorch-cxx11-builder
DOCKER_IMAGE_TAG_PREFIX: rocm7.0
LIBTORCH_CONFIG: release
LIBTORCH_VARIANT: shared-with-deps
steps:
- name: Setup ROCm
uses: ./.github/actions/setup-rocm
- uses: actions/download-artifact@v4.1.7
name: Download Build Artifacts
with:
name: libtorch-rocm7_0-shared-with-deps-release
path: "${{ runner.temp }}/artifacts/"
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
show-progress: false
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: ROCm set GPU_FLAG
run: |
echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}"
- name: configure aws credentials
id: aws_creds
if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') }}
uses: aws-actions/configure-aws-credentials@v4
with:
role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
aws-region: us-east-1
role-duration-seconds: 18000
- name: Calculate docker image
id: calculate-docker-image
uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
with:
docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }}
docker-image-name: libtorch-cxx11-builder
custom-tag-prefix: rocm7.0
docker-build-dir: .ci/docker
working-directory: pytorch
- name: Pull Docker image
uses: pytorch/test-infra/.github/actions/pull-docker-image@main
with:
docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
- name: Test Pytorch binary
uses: ./pytorch/.github/actions/test-pytorch-binary
env:
DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
- name: Teardown ROCm
uses: ./.github/actions/teardown-rocm
libtorch-rocm7_0-shared-with-deps-release-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: libtorch-rocm7_0-shared-with-deps-release-test
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: rocm7.0
GPU_ARCH_VERSION: "7.0"
GPU_ARCH_TYPE: rocm
DOCKER_IMAGE: libtorch-cxx11-builder
DOCKER_IMAGE_TAG_PREFIX: rocm7.0
LIBTORCH_CONFIG: release
LIBTORCH_VARIANT: shared-with-deps
build_name: libtorch-rocm7_0-shared-with-deps-release
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml

View File

@ -60,7 +60,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_12-cuda13_0
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.28.3; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_12-cuda13_0-test: # Testing

File diff suppressed because it is too large.

View File

@ -20,7 +20,7 @@ on:
- .github/workflows/operator_benchmark.yml
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
group: ${{ github.workflow }}-${{ (github.event_name == 'pull_request' && github.head_ref) || github.ref_name }}
cancel-in-progress: true
permissions:

View File

@ -0,0 +1,46 @@
name: operator_microbenchmark
on:
push:
tags:
- ciflow/op-benchmark/*
workflow_dispatch:
schedule:
# Run at 06:00 UTC everyday
- cron: 0 6 * * *
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
cancel-in-progress: true
permissions:
id-token: write
contents: read
jobs:
opmicrobenchmark-build:
if: github.repository_owner == 'pytorch'
name: opmicrobenchmark-build
uses: ./.github/workflows/_linux-build.yml
with:
runner: linux.12xlarge.memory
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
cuda-arch-list: '8.0 9.0'
test-matrix: |
{ include: [
{ config: "operator_microbenchmark_test", shard: 1, num_shards: 1, runner: "linux.aws.h100" },
{ config: "operator_microbenchmark_test", shard: 1, num_shards: 1, runner: "linux.aws.a100" },
]}
secrets: inherit
opmicrobenchmark-test:
name: opmicrobenchmark-test
uses: ./.github/workflows/_linux-test.yml
needs: opmicrobenchmark-build
with:
timeout-minutes: 500
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
docker-image: ${{ needs.opmicrobenchmark-build.outputs.docker-image }}
test-matrix: ${{ needs.opmicrobenchmark-build.outputs.test-matrix }}
secrets: inherit

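Besides the tag and cron triggers, the `workflow_dispatch:` trigger means this benchmark can also be kicked off on demand through the standard GitHub workflow-dispatch REST endpoint. A hedged sketch (the token and ref are placeholders, not part of the workflow above):

```python
import os
import requests

# Hypothetical manual trigger of the workflow above via the REST API.
# Requires a token with workflow scope; "main" is the ref to run against.
resp = requests.post(
    "https://api.github.com/repos/pytorch/pytorch/actions/workflows/"
    "operator_microbenchmark.yml/dispatches",
    headers={
        "Authorization": f"Bearer {os.environ['GITHUB_TOKEN']}",
        "Accept": "application/vnd.github+json",
    },
    json={"ref": "main"},
)
resp.raise_for_status()  # the endpoint returns 204 No Content on success
```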
View File

@ -442,7 +442,7 @@ if(WIN32)
message(
WARNING
"Libuv is not installed in current conda env. Set USE_DISTRIBUTED to OFF. "
"Please run command 'conda install -c conda-forge libuv=1.39' to install libuv."
"Please run command 'conda install -c conda-forge libuv=1.51' to install libuv."
)
else()
set(ENV{libuv_ROOT} ${libuv_tmp_LIBRARY}/../../)

View File

@ -275,7 +275,7 @@ conda install pkg-config libuv
pip install mkl-static mkl-include
# Add these packages if torch.distributed is needed.
# Distributed package support on Windows is a prototype feature and is subject to changes.
conda install -c conda-forge libuv
conda install -c conda-forge libuv=1.51
```
#### Install PyTorch

View File

@ -45,7 +45,39 @@ inline void infer_size_impl(
}
}
auto set_infer_dim = [&]() {
if (infer_dim) {
// numel is the product of the known sizes, so it has to be divisible by
// newsize, and newsize should be positive unless newsize == numel (we throw
// a different error message in that case).
if constexpr (std::is_same_v<NumelType, c10::SymInt>) {
auto v = newsize.maybe_as_int();
if (v and *v == 0) {
// Avoid division by 0 when sym_eq(numel % newsize, 0) is constructed,
// which may happen when newsize is not a symbol; if it is a symbol, the
// division won't happen during compile anyway.
TORCH_MAYBE_SYM_CHECK(
numel == newsize,
"shape '",
shape,
"' is invalid for input of size ",
numel);
} else {
auto cond = sym_gt(newsize, 0)
.sym_and(sym_eq(numel % newsize, 0))
.sym_or(sym_eq(numel, newsize));
TORCH_MAYBE_SYM_CHECK(
cond, "shape '", shape, "' is invalid for input of size ", numel);
}
} else {
TORCH_CHECK(
(newsize > 0 && (numel % newsize == 0)) || numel == newsize,
"shape '",
shape,
"' is invalid for input of size ",
numel);
}
// We have a degree of freedom here to select the dimension size; follow
// NumPy semantics and just bail. However, a nice error message is needed
// because users often use `view` as a way to flatten & unflatten
@ -54,18 +86,14 @@ inline void infer_size_impl(
// works yet
// empty_tensor.view(-1, 0)
// doesn't.
TORCH_CHECK(
TORCH_MAYBE_SYM_CHECK(
newsize != 0,
"cannot reshape tensor of 0 elements into shape ",
shape,
" because the unspecified dimension size -1 can be any "
"value and is ambiguous");
res[*infer_dim] = numel / newsize;
return;
};
if (infer_dim && newsize > 0 && numel % newsize == 0) {
set_infer_dim();
res[*infer_dim] = numel / newsize;
return;
}
@ -75,9 +103,6 @@ inline void infer_size_impl(
shape,
"' is invalid for input of size ",
numel);
if (infer_dim) {
set_infer_dim();
}
}
inline std::vector<int64_t> infer_size(IntArrayRef shape, int64_t numel) {
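
For reference, a minimal scalar sketch of the `-1` inference rule the comments above describe, assuming plain `int64_t` sizes (the symbolic-int branch is deliberately omitted and the names are illustrative, not ATen's):

```cpp
#include <cstdint>
#include <optional>
#include <stdexcept>
#include <string>
#include <vector>

// Sketch only; mirrors the non-symbolic branch of infer_size_impl above.
std::vector<int64_t> infer_size_sketch(std::vector<int64_t> shape, int64_t numel) {
  int64_t newsize = 1;               // product of the known (non -1) sizes
  std::optional<size_t> infer_dim;
  for (size_t i = 0; i < shape.size(); ++i) {
    if (shape[i] == -1) {
      infer_dim = i;                 // duplicate -1 handling omitted here
    } else {
      newsize *= shape[i];
    }
  }
  if (!infer_dim) {
    if (newsize != numel) {
      throw std::runtime_error("shape is invalid for input of size " + std::to_string(numel));
    }
    return shape;
  }
  if (!((newsize > 0 && numel % newsize == 0) || numel == newsize)) {
    throw std::runtime_error("shape is invalid for input of size " + std::to_string(numel));
  }
  if (newsize == 0) {
    // e.g. empty_tensor.view(-1, 0): the -1 could be any value, so refuse.
    throw std::runtime_error("cannot reshape tensor of 0 elements: -1 is ambiguous");
  }
  // e.g. numel = 12, shape = {-1, 4}: newsize = 4 -> inferred size = 3
  shape[*infer_dim] = numel / newsize;
  return shape;
}
```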

View File

@ -103,7 +103,9 @@ std::string get_cpu_capability() {
#elif defined(HAVE_ZVECTOR_CPU_DEFINITION)
case native::CPUCapability::ZVECTOR:
return "Z VECTOR";
#elif defined(HAVE_SVE256_CPU_DEFINITION) && defined(HAVE_ARM_BF16_CPU_DEFINITION)
#elif defined(HAVE_SVE_CPU_DEFINITION) && defined(HAVE_ARM_BF16_CPU_DEFINITION)
case native::CPUCapability::SVE128:
return "SVE128";
case native::CPUCapability::SVE256:
return "SVE256";
#else

View File

@ -1234,7 +1234,7 @@ struct TORCH_API TupleType : public NamedType {
std::shared_ptr<FunctionSchema> schema_;
};
// the common supertype of all Enums, only used in operator registraion.
// the common supertype of all Enums, only used in operator registration.
// EnumType <: AnyEnumType for all Enums
struct AnyEnumType;
using AnyEnumTypePtr = SingletonTypePtr<AnyEnumType>;

View File

@ -102,8 +102,31 @@ struct VecReduceAllSIMD<float, Op> {
#endif // defined(__GNUC__) && (__GNUC__ > 5) && !defined(_MSC_VER) &&
// !defined(C10_MOBILE)
#if defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__) && \
!defined(CPU_CAPABILITY_SVE)
#if defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__)
#if defined(CPU_CAPABILITY_SVE256)
template <typename Op>
struct VecReduceAllSIMD<float, Op> {
static inline float apply(
const Op& vec_fun,
const Vectorized<float>& acc_vec) {
using Vec = Vectorized<float>;
Vec v = acc_vec;
// 128-bit shuffle
svuint32_t ind = svdupq_n_u32(4, 5, 6, 7);
Vec v1 = svtbl_f32(v, ind);
v = vec_fun(v, v1);
// 64-bit shuffle
ind = svdupq_n_u32(2, 3, 0, 1);
v1 = svtbl_f32(v, ind);
v = vec_fun(v, v1);
// 32-bit shuffle
ind = svdupq_n_u32(1, 0, 2, 3);
v1 = svtbl_f32(v, ind);
v = vec_fun(v, v1);
return svlasta(svpfalse(), v);
}
};
#else
template <typename Op>
struct VecReduceAllSIMD<float, Op> {
static inline float apply(
@ -140,35 +163,8 @@ struct VecReduceAllSIMD<float, std::plus<Vectorized<float>>> {
return vaddvq_f32(acc_vec);
}
};
#endif // defined(CPU_CAPABILITY_SVE256)
#endif // defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__)
// && !defined(CPU_CAPABILITY_SVE)
#if defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__) && \
defined(CPU_CAPABILITY_SVE256)
template <typename Op>
struct VecReduceAllSIMD<float, Op> {
static inline float apply(
const Op& vec_fun,
const Vectorized<float>& acc_vec) {
using Vec = Vectorized<float>;
Vec v = acc_vec;
// 128-bit shuffle
svuint32_t ind = svdupq_n_u32(4, 5, 6, 7);
Vec v1 = svtbl_f32(v, ind);
v = vec_fun(v, v1);
// 64-bit shuffle
ind = svdupq_n_u32(2, 3, 0, 1);
v1 = svtbl_f32(v, ind);
v = vec_fun(v, v1);
// 32-bit shuffle
ind = svdupq_n_u32(1, 0, 2, 3);
v1 = svtbl_f32(v, ind);
v = vec_fun(v, v1);
return svlasta(svpfalse(), v);
}
};
#endif // defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__)
// && defined(CPU_CAPABILITY_SVE256)
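// A scalar analogue of the shuffle sequence above (a sketch, not the SVE
// code): each svtbl + vec_fun step folds the upper half of the active lanes
// onto the lower half, halving the span until one lane remains.
//   step 1 (indices {4,5,6,7,...}): v[i] = op(v[i], v[i+4])  for i in 0..3
//   step 2 (indices {2,3,0,1,...}): v[i] = op(v[i], v[i+2])  for i in 0..1
//   step 3 (indices {1,0,2,3,...}): v[0] = op(v[0], v[1])
//   svlasta(svpfalse(), v) then extracts lane 0 as the scalar result.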
template <typename scalar_t, typename Op>
inline scalar_t vec_reduce_all(

View File

@ -1,9 +1,21 @@
#pragma once
#include <ATen/cpu/vec/intrinsics.h>
#include <c10/macros/Macros.h>
#include <cstdint>
#include <ATen/cpu/vec/vec_base.h>
#if defined(__aarch64__) && \
(defined(AT_BUILD_ARM_VEC256_WITH_SLEEF) || \
defined(AT_BUILD_ARM_VECSVE_WITH_SLEEF))
#define SLEEF_STATIC_LIBS
#include <sleef.h>
#define USE_SLEEF(sleef_code, non_sleef_code) sleef_code
#else
#define USE_SLEEF(sleef_code, non_sleef_code) non_sleef_code
#endif
#if defined(CPU_CAPABILITY_SVE)
// Define the data type of VLS(vector-length specific).

View File

@ -2,7 +2,6 @@
#include <ATen/cpu/vec/intrinsics.h>
#include <ATen/cpu/vec/sve/sve_helper.h>
#include <ATen/cpu/vec/sve/vec_common_sve.h>
#include <ATen/cpu/vec/sve/vec_float.h>
#include <ATen/cpu/vec/vec_base.h>
#include <c10/util/bit_cast.h>

View File

@ -1,6 +1,8 @@
#pragma once
#if defined(CPU_CAPABILITY_AVX512)
#if defined(__aarch64__)
#include <ATen/cpu/vec/vec_common_aarch64.h>
#elif defined(CPU_CAPABILITY_AVX512)
#include <ATen/cpu/vec/vec512/vec512.h>
#else
#include <ATen/cpu/vec/vec128/vec128.h>
@ -11,6 +13,34 @@ namespace at::vec {
// See Note [CPU_CAPABILITY namespace]
inline namespace CPU_CAPABILITY {
inline std::ostream& operator<<(std::ostream& stream, const c10::qint32& val) {
stream << val.val_;
return stream;
}
inline std::ostream& operator<<(std::ostream& stream, const c10::qint8& val) {
stream << static_cast<int>(val.val_);
return stream;
}
inline std::ostream& operator<<(std::ostream& stream, const c10::quint8& val) {
stream << static_cast<unsigned int>(val.val_);
return stream;
}
template <typename T>
std::ostream& operator<<(std::ostream& stream, const Vectorized<T>& vec) {
T buf[Vectorized<T>::size()];
vec.store(buf);
stream << "vec[";
for (int i = 0; i != Vectorized<T>::size(); i++) {
if (i != 0) {
stream << ", ";
}
stream << buf[i];
}
stream << "]";
return stream;
}
inline Vectorized<bool> convert_to_bool(Vectorized<int8_t> x) {
__at_align__ bool buffer[x.size()];
x.ne(Vectorized<int8_t>(0)).store(buffer);

View File

@ -2,6 +2,7 @@
// DO NOT DEFINE STATIC DATA IN THIS HEADER!
// See Note [Do not compile initializers with AVX]
#include <ATen/cpu/vec/sve/sve_helper.h>
#include <ATen/cpu/vec/vec128/vec128_float_neon.h>
#include <ATen/cpu/vec/vec128/vec128_reduced_precision_common_neon.h>
#include <ATen/cpu/vec/vec_base.h>
@ -262,6 +263,13 @@ class Vectorized<c10::BFloat16> : public Vectorized16<
c10::bit_cast<at_bfloat16_t>(val6.x),
c10::bit_cast<at_bfloat16_t>(val7.x)}) {}
#ifdef CPU_CAPABILITY_SVE128
Vectorized(svbfloat16_t v) : Vectorized16(svget_neonq(v)) {}
operator svbfloat16_t() const {
return svset_neonq(svundef_bf16(), values);
}
#endif
static Vectorized<c10::BFloat16> blendv(
const Vectorized<c10::BFloat16>& a,
const Vectorized<c10::BFloat16>& b,
@ -374,6 +382,23 @@ class Vectorized<c10::BFloat16> : public Vectorized16<
Vectorized ge(const Vectorized& other) const;
Vectorized lt(const Vectorized& other) const;
Vectorized le(const Vectorized& other) const;
#ifdef CPU_CAPABILITY_SVE128
template <typename step_t>
static Vectorized<BFloat16> arange(
BFloat16 base = 0.f,
step_t step = static_cast<step_t>(1)) {
__at_align__ BFloat16 buffer[size()];
for (int64_t i = 0; i < size(); i++) {
buffer[i] = base + i * step;
}
return svget_neonq(
svld1_bf16(ptrue, reinterpret_cast<bfloat16_t*>(buffer)));
}
#endif // CPU_CAPABILITY_SVE128
}; // Vectorized<c10::BFloat16>
inline std::tuple<Vectorized<float>, Vectorized<float>> convert_bfloat16_float(
@ -397,6 +422,24 @@ inline Vectorized<c10::BFloat16> convert_float_bfloat16(
return Vectorized<c10::BFloat16>(at_vcombine_bf16(x1, x2));
}
inline void load_fp32_from_bf16(const BFloat16* data, Vectorized<float>& out) {
__at_align__ float values[Vectorized<float>::size()];
for (const auto k : c10::irange(Vectorized<float>::size())) {
values[k] = data[k];
}
out = Vectorized<float>::loadu(values);
}
inline void load_fp32_from_bf16(
const BFloat16* data,
Vectorized<float>& out1,
Vectorized<float>& out2) {
Vectorized<BFloat16> bf16_vec = Vectorized<BFloat16>::loadu(data);
auto floats = convert_bfloat16_float(bf16_vec);
out1 = std::get<0>(floats);
out2 = std::get<1>(floats);
}
template <typename Op>
Vectorized<c10::BFloat16> binary_operator_via_float(
Op op,
@ -579,6 +622,12 @@ Vectorized<c10::BFloat16> inline fnmsub(
return -a * b - c;
}
#else //
CONVERT_NON_VECTORIZED_INIT(BFloat16, bfloat16)
LOAD_FP32_NON_VECTORIZED_INIT(BFloat16, bf16)
#endif // !defined(C10_MOBILE) && defined(__aarch64__)
} // namespace CPU_CAPABILITY
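
A usage sketch of the conversion helpers added above (assumes an aarch64 build of these headers; the buffers must hold `Vectorized<BFloat16>::size()` elements):

```cpp
#include <ATen/cpu/vec/vec.h>
#include <c10/util/BFloat16.h>

// Widen one BFloat16 vector to two float vectors, compute in float, narrow back.
void scale_bf16(const c10::BFloat16* in, c10::BFloat16* out, float s) {
  auto v = at::vec::Vectorized<c10::BFloat16>::loadu(in);
  auto [lo, hi] = at::vec::convert_bfloat16_float(v);
  auto sv = at::vec::Vectorized<float>(s);
  at::vec::convert_float_bfloat16(lo * sv, hi * sv).store(out);
}
```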

View File

@ -4,7 +4,7 @@
namespace at::vec {
inline namespace CPU_CAPABILITY {
#if (defined(__aarch64__) && !defined(CPU_CAPABILITY_SVE256))
#if defined(__aarch64__) && !defined(CPU_CAPABILITY_SVE256)
template <typename src_t>
struct VecConvert<
float,
@ -60,6 +60,7 @@ struct VecConvert<float, 1, BFloat16, 1> {
}
};
#endif // defined(__aarch64__) && !defined(CPU_CAPABILITY_SVE256)
#endif // defined(__aarch64__) && (!defined(CPU_CAPABILITY_SVE) ||
// defined(CPU_CAPABILITY_SVE128))
} // namespace CPU_CAPABILITY
} // namespace at::vec

View File

@ -4,13 +4,10 @@
// See Note [Do not compile initializers with AVX]
#include <ATen/cpu/vec/intrinsics.h>
#include <ATen/cpu/vec/sve/sve_helper.h>
#include <ATen/cpu/vec/vec_base.h>
#include <c10/util/irange.h>
#if defined(__aarch64__) && defined(AT_BUILD_ARM_VEC256_WITH_SLEEF)
#include <sleef.h>
#endif
// Sleef offers vectorized versions of some transcendentals
// such as sin, cos, tan, etc.
// However for now opting for STL, since we are not building
@ -35,12 +32,6 @@ inline namespace CPU_CAPABILITY {
#error "Big endian is not supported."
#endif
#if defined(AT_BUILD_ARM_VEC256_WITH_SLEEF)
#define USE_SLEEF(sleef_code, non_sleef_code) sleef_code
#else
#define USE_SLEEF(sleef_code, non_sleef_code) non_sleef_code
#endif
template <int index, bool mask_val>
struct BlendRegs {
static float32x4_t impl(
@ -94,6 +85,12 @@ class Vectorized<float> {
operator float32x4_t() const {
return values;
}
#ifdef CPU_CAPABILITY_SVE128
Vectorized(svfloat32_t v) : values(svget_neonq(v)) {}
operator svfloat32_t() const {
return svset_neonq(svundef_f32(), values);
}
#endif
template <int64_t mask>
static Vectorized<float> blend(
const Vectorized<float>& a,

View File

@ -4,7 +4,6 @@
// See Note [Do not compile initializers with AVX]
#include <ATen/cpu/vec/intrinsics.h>
#include <ATen/cpu/vec/vec128/vec128_convert.h>
#include <ATen/cpu/vec/vec128/vec128_float_neon.h>
#include <ATen/cpu/vec/vec128/vec128_reduced_precision_common_neon.h>
#include <ATen/cpu/vec/vec_base.h>
@ -25,7 +24,6 @@ inline namespace CPU_CAPABILITY {
// https://bugs.llvm.org/show_bug.cgi?id=45824
// Most likely we will do aarch32 support with inline asm.
#if !defined(C10_MOBILE) && defined(__aarch64__)
#ifdef __BIG_ENDIAN__
#error "Big endian is not supported."
#endif
@ -421,6 +419,24 @@ Vectorized<c10::Half> inline operator+(
#endif
}
inline void load_fp32_from_fp16(const c10::Half* data, Vectorized<float>& out) {
__at_align__ float values[Vectorized<float>::size()];
for (const auto k : c10::irange(Vectorized<float>::size())) {
values[k] = data[k];
}
out = Vectorized<float>::loadu(values);
}
inline void load_fp32_from_fp16(
const c10::Half* data,
Vectorized<float>& out1,
Vectorized<float>& out2) {
Vectorized<c10::Half> f16_vec = Vectorized<c10::Half>::loadu(data);
auto floats = convert_half_float(f16_vec);
out1 = std::get<0>(floats);
out2 = std::get<1>(floats);
}
template <>
Vectorized<c10::Half> inline operator-(
const Vectorized<c10::Half>& a,
@ -656,6 +672,53 @@ Vectorized<c10::Half> inline fnmsub(
return -a * b - c;
#endif
}
#else
#define CONVERT_NON_VECTORIZED_INIT(type, name) \
inline std::tuple<Vectorized<float>, Vectorized<float>> \
convert_##name##_float(const Vectorized<type>& a) { \
constexpr int64_t K = Vectorized<type>::size(); \
__at_align__ float arr[K]; \
__at_align__ type arr2[K]; \
a.store(arr2); \
convert(arr2, arr, K); \
return std::make_tuple( \
Vectorized<float>::loadu(arr), \
Vectorized<float>::loadu(arr + Vectorized<float>::size())); \
} \
inline Vectorized<type> convert_float_##name( \
const Vectorized<float>& a, const Vectorized<float>& b) { \
constexpr int64_t K = Vectorized<type>::size(); \
__at_align__ float arr[K]; \
__at_align__ type arr2[K]; \
a.store(arr); \
b.store(arr + Vectorized<float>::size()); \
convert(arr, arr2, K); \
return Vectorized<type>::loadu(arr2); \
}
#define LOAD_FP32_NON_VECTORIZED_INIT(type, name) \
inline void load_fp32_from_##name( \
const type* data, Vectorized<float>& out) { \
__at_align__ float values[Vectorized<float>::size()]; \
for (const auto k : c10::irange(Vectorized<float>::size())) { \
values[k] = data[k]; \
} \
out = Vectorized<float>::loadu(values); \
} \
\
inline void load_fp32_from_##name( \
const type* data, Vectorized<float>& out1, Vectorized<float>& out2) { \
load_fp32_from_##name(data, out1); \
data += Vectorized<float>::size(); \
load_fp32_from_##name(data, out2); \
}
CONVERT_NON_VECTORIZED_INIT(Half, half)
LOAD_FP32_NON_VECTORIZED_INIT(Half, fp16)
#endif // !defined(C10_MOBILE) && defined(__aarch64__)
} // namespace CPU_CAPABILITY
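
For orientation, the fallback macros above generate (roughly) these Half entry points, built from plain element-wise loads rather than NEON intrinsics:

```cpp
// Rough shape of CONVERT_NON_VECTORIZED_INIT(Half, half) and
// LOAD_FP32_NON_VECTORIZED_INIT(Half, fp16); not literal preprocessor output.
// std::tuple<Vectorized<float>, Vectorized<float>> convert_half_float(const Vectorized<Half>& a);
// Vectorized<Half> convert_float_half(const Vectorized<float>& a, const Vectorized<float>& b);
// void load_fp32_from_fp16(const Half* data, Vectorized<float>& out);
// void load_fp32_from_fp16(const Half* data, Vectorized<float>& out1, Vectorized<float>& out2);
```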

View File

@ -9,21 +9,16 @@
#if !( \
defined(__VSX__) || defined(CPU_CAPABILITY_VSX) || \
defined(CPU_CAPABILITY_ZVECTOR))
#if defined(CPU_CAPABILITY_SVE256)
#include <ATen/cpu/vec/sve/vec_common_sve.h>
#else
// clang-format off
#include <ATen/cpu/vec/vec256/vec256_float.h>
#include <ATen/cpu/vec/vec256/vec256_double.h>
#include <ATen/cpu/vec/vec256/vec256_float.h>
#include <ATen/cpu/vec/vec256/vec256_int.h>
#include <ATen/cpu/vec/vec256/vec256_qint.h>
#endif
#if !defined(CPU_CAPABILITY_SVE256) || !defined(__ARM_FEATURE_BF16)
#include <ATen/cpu/vec/vec256/vec256_bfloat16.h>
#endif
#include <ATen/cpu/vec/vec256/vec256_half.h>
#include <ATen/cpu/vec/vec256/vec256_complex_float.h>
#include <ATen/cpu/vec/vec256/vec256_complex_double.h>
#include <ATen/cpu/vec/vec256/vec256_complex_float.h>
#include <ATen/cpu/vec/vec256/vec256_half.h>
// clang-format on
#elif defined(__VSX__) || defined(CPU_CAPABILITY_VSX)
#include <ATen/cpu/vec/vec256/vsx/vec256_common_vsx.h>
@ -56,34 +51,6 @@ namespace at::vec {
// accessed as `at::vec`.
inline namespace CPU_CAPABILITY {
inline std::ostream& operator<<(std::ostream& stream, const c10::qint32& val) {
stream << val.val_;
return stream;
}
inline std::ostream& operator<<(std::ostream& stream, const c10::qint8& val) {
stream << static_cast<int>(val.val_);
return stream;
}
inline std::ostream& operator<<(std::ostream& stream, const c10::quint8& val) {
stream << static_cast<unsigned int>(val.val_);
return stream;
}
template <typename T>
std::ostream& operator<<(std::ostream& stream, const Vectorized<T>& vec) {
T buf[Vectorized<T>::size()];
vec.store(buf);
stream << "vec[";
for (int i = 0; i != Vectorized<T>::size(); i++) {
if (i != 0) {
stream << ", ";
}
stream << buf[i];
}
stream << "]";
return stream;
}
#if defined(CPU_CAPABILITY_AVX2)
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CAST (AVX2) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

View File

@ -268,9 +268,7 @@ LOAD_FP32_VECTORIZED_INIT(BFloat16, bf16)
#else // defined(CPU_CAPABILITY_AVX2)
#if !( \
defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__) && \
!defined(CPU_CAPABILITY_SVE256))
#if !(defined(__aarch64__))
CONVERT_NON_VECTORIZED_INIT(BFloat16, bfloat16)
#endif

View File

@ -268,9 +268,7 @@ LOAD_FP32_VECTORIZED_INIT(Half, fp16)
#else // defined(CPU_CAPABILITY_AVX2)
#if !( \
defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__) && \
!defined(CPU_CAPABILITY_SVE256))
#if !defined(__aarch64__) || defined(CPU_CAPABILITY_SVE256)
CONVERT_NON_VECTORIZED_INIT(Half, half)
#endif

View File

@ -5,6 +5,13 @@
#include <ATen/cpu/vec/intrinsics.h>
#include <ATen/cpu/vec/vec_base.h>
#ifdef __aarch64__
#if defined(CPU_CAPABILITY_SVE128) || !defined(CPU_CAPABILITY_SVE)
#include <ATen/cpu/vec/vec128/vec128_float_neon.h>
#endif
#endif
#include <ATen/native/quantized/AffineQuantizerBase.h>
#include <c10/util/irange.h>
@ -915,7 +922,7 @@ Vectorized<c10::quint8> inline maximum(
return a.maximum(b);
}
#elif !defined(CPU_CAPABILITY_SVE256)
#else
// NOTE: These are low-performance implementations that we fall back on
// if we are not building with AVX2. This may not be an issue, because
@ -1372,12 +1379,18 @@ Vectorized<c10::quint8> inline maximum(
return a.maximum(b);
}
#endif // if defined(CPU_CAPABILITY_AVX2)
#if (defined(__aarch64__) && !defined(CPU_CAPABILITY_SVE256))
#if defined(__aarch64__) && \
(defined(CPU_CAPABILITY_SVE128) || !defined(CPU_CAPABILITY_SVE))
std::pair<Vectorized<float>, Vectorized<float>> inline convert_int8_to_float(
at::vec::Vectorized<int8_t> src) {
#ifdef CPU_CAPABILITY_SVE
svint8_t x = src;
auto s8x8 = vget_low_s8(svget_neonq(x));
#else
auto s8x8 = vld1_s8(src.operator const int8_t*());
#endif
auto s16x8 = vmovl_s8(s8x8);
auto s32x4_hi = vmovl_s16(vget_high_s16(s16x8));
@ -1402,7 +1415,14 @@ std::pair<Vectorized<float>, Vectorized<float>> inline convert_int8_to_float(
Vectorized<float> inline convert_int8_half_register_to_float(
at::vec::Vectorized<int8_t> src) {
#ifdef CPU_CAPABILITY_SVE
svint8_t x = src;
auto s8x8 = vget_low_s8(svget_neonq(x));
#else
auto s8x8 = vld1_s8(src.operator const int8_t*());
#endif
auto s16x8 = vmovl_s8(s8x8);
auto s32x4_lo = vmovl_s16(vget_low_s16(s16x8));
@ -1420,5 +1440,8 @@ Vectorized<float> inline convert_int8_half_register_to_float(
}
#endif
#endif // if defined(CPU_CAPABILITY_AVX2)
} // namespace CPU_CAPABILITY
} // namespace at::vec

View File

@ -31,34 +31,6 @@ namespace vec {
// See Note [CPU_CAPABILITY namespace]
inline namespace CPU_CAPABILITY {
inline std::ostream& operator<<(std::ostream& stream, const c10::qint32& val) {
stream << val.val_;
return stream;
}
inline std::ostream& operator<<(std::ostream& stream, const c10::qint8& val) {
stream << static_cast<int>(val.val_);
return stream;
}
inline std::ostream& operator<<(std::ostream& stream, const c10::quint8& val) {
stream << static_cast<unsigned int>(val.val_);
return stream;
}
template <typename T>
std::ostream& operator<<(std::ostream& stream, const Vectorized<T>& vec) {
T buf[Vectorized<T>::size()];
vec.store(buf);
stream << "vec[";
for (int i = 0; i != Vectorized<T>::size(); i++) {
if (i != 0) {
stream << ", ";
}
stream << buf[i];
}
stream << "]";
return stream;
}
#if defined(CPU_CAPABILITY_AVX512)
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CAST (AVX512)

View File

@ -67,18 +67,7 @@ Windows llvm will not have this definition.
#endif
#define VECTOR_WIDTH 64
#define int_vector __m512i
#elif defined(__aarch64__) && \
!defined(CPU_CAPABILITY_SVE) // CPU_CAPABILITY_AVX512
// SVE code expects 256-vectors; leave that set for SVE?
#if defined(__GNUC__)
#define __at_align__ __attribute__((aligned(16)))
#elif defined(_WIN32)
#define __at_align__ __declspec(align(16))
#else
#define __at_align__
#endif
#define VECTOR_WIDTH 16
#else // CPU_CAPABILITY_AVX512
#elif defined(CPU_CAPABILITY_AVX2) || defined(CPU_CAPABILITY_SVE256)
#if defined(__GNUC__)
#define __at_align__ __attribute__((aligned(32)))
#elif defined(_WIN32)
@ -88,7 +77,27 @@ Windows llvm will not have this definition.
#endif
#define VECTOR_WIDTH 32
#define int_vector __m256i
#endif // CPU_CAPABILITY_AVX512
#elif defined(__aarch64__)
// Define alignment and vector width for SVE128/Default (e.g., NEON)
#if defined(__GNUC__)
#define __at_align__ __attribute__((aligned(16)))
#elif defined(_WIN32)
#define __at_align__ __declspec(align(16))
#else
#define __at_align__
#endif
#define VECTOR_WIDTH 16
#else
// Fallback: define default alignment and vector width
#if defined(__GNUC__)
#define __at_align__ __attribute__((aligned(32)))
#elif defined(_WIN32)
#define __at_align__ __declspec(align(32))
#else
#define __at_align__
#endif
#define VECTOR_WIDTH 32
#endif
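// Consumption sketch (not a line from this file): kernels typically size and
// align stack scratch space off these macros, e.g.
//   __at_align__ float buf[VECTOR_WIDTH / sizeof(float)];
// which yields 4 floats under the 16-byte aarch64/SVE128 branch above, 8 under
// AVX2/SVE256, and 16 under AVX-512.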
namespace at::vec {
// See Note [CPU_CAPABILITY namespace]

View File

@ -8,13 +8,48 @@
#include <ATen/cpu/vec/sve/sve_helper.h>
#include <ATen/cpu/vec/vec_base.h>
#if defined(CPU_CAPABILITY_SVE)
#include <ATen/cpu/vec/sve/vec_bfloat16.h>
#include <ATen/cpu/vec/sve/vec_double.h>
#include <ATen/cpu/vec/sve/vec_float.h>
#include <ATen/cpu/vec/sve/vec_int.h>
#ifdef CPU_CAPABILITY_SVE128
#include <ATen/cpu/vec/vec128/vec128_float_neon.h>
#include <ATen/cpu/vec/vec128/vec128_bfloat16_neon.h>
#include <ATen/cpu/vec/vec128/vec128_half_neon.h>
#include <ATen/cpu/vec/vec128/vec128_convert.h>
#include <ATen/cpu/vec/sve/vec_qint.h>
#endif
#elif defined(CPU_CAPABILITY_SVE)
#include <ATen/cpu/vec/sve/vec_float.h>
#include <ATen/cpu/vec/sve/vec_bfloat16.h>
#include <ATen/cpu/vec/sve/vec_double.h>
#include <ATen/cpu/vec/sve/vec_int.h>
#include <ATen/cpu/vec/sve/vec_qint.h>
#include <ATen/cpu/vec/vec256/vec256_half.h>
#include <ATen/cpu/vec/vec256/vec256_convert.h>
#else // NEON
#include <ATen/cpu/vec/vec128/vec128_float_neon.h>
#include <ATen/cpu/vec/vec128/vec128_half_neon.h>
#include <ATen/cpu/vec/vec128/vec128_bfloat16_neon.h>
#include <ATen/cpu/vec/vec128/vec128_convert.h>
#include <ATen/cpu/vec/vec256/vec256_qint.h>
#endif // defined(CPU_CAPABILITY_SVE128)
#include <ATen/cpu/vec/functional.h>
namespace at::vec {
// Note [CPU_CAPABILITY namespace]
@ -48,12 +83,6 @@ DEFINE_SVE_CAST(int32_t, s32, float, f32)
DEFINE_SVE_CAST(int16_t, s16, float, f32)
DEFINE_SVE_CAST(float, f32, double, f64)
#ifdef __ARM_FEATURE_BF16
DEFINE_SVE_CAST(int64_t, s64, c10::BFloat16, bf16)
DEFINE_SVE_CAST(int32_t, s32, c10::BFloat16, bf16)
DEFINE_SVE_CAST(int16_t, s16, c10::BFloat16, bf16)
#endif // __ARM_FEATURE_BF16
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
template <int64_t scale = 1>
@ -173,9 +202,11 @@ std::pair<
// group cols crossing lanes:
// return {a0, b0, a1, b1, a2, b2, a3, b3}
// {a4, b4, a5, b5, a6, b6, a7, b7}
return std::make_pair(
Vectorized<c10::BFloat16>(svzip1_bf16(a, b)),
Vectorized<c10::BFloat16>(svzip2_bf16(a, b)));
svbfloat16_t aReg = a;
svbfloat16_t bReg = b;
Vectorized<c10::BFloat16> c = svzip1_bf16(aReg, bReg);
Vectorized<c10::BFloat16> d = svzip2_bf16(aReg, bReg);
return std::make_pair(c, d);
}
#endif // __ARM_FEATURE_BF16
@ -224,12 +255,27 @@ std::pair<
// swap lanes:
// return {a0, a1, a2, a3, a4, a5, a6, a7}
// {b0, b1, b2, b3, b4, b5, b6, b7}
return std::make_pair(
Vectorized<c10::BFloat16>(svuzp1_bf16((svbfloat16_t)a, (svbfloat16_t)b)),
Vectorized<c10::BFloat16>(svuzp2_bf16((svbfloat16_t)a, (svbfloat16_t)b)));
svbfloat16_t aReg = a;
svbfloat16_t bReg = b;
Vectorized<c10::BFloat16> c = svuzp1_bf16(aReg, bReg);
Vectorized<c10::BFloat16> d = svuzp2_bf16(aReg, bReg);
return std::make_pair(c, d);
}
#endif // __ARM_FEATURE_BF16
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ FLIP ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#define DEFINE_FLIP_FUNC(type, sve_func) \
inline Vectorized<type> flip(const Vectorized<type>& v) { \
return Vectorized<type>(sve_func(v)); \
}
// Use the macro to define the flip functions
DEFINE_FLIP_FUNC(float, svrev_f32)
DEFINE_FLIP_FUNC(double, svrev_f64)
DEFINE_FLIP_FUNC(int64_t, svrev_s64)
DEFINE_FLIP_FUNC(int32_t, svrev_s32)
DEFINE_FLIP_FUNC(int16_t, svrev_s16)
DEFINE_FLIP_FUNC(int8_t, svrev_s8)
#endif // defined(CPU_CAPABILITY_SVE)
} // namespace CPU_CAPABILITY
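
Written out on a shortened 4-lane example, the lane contracts of the zip/unzip helpers and the new flip functions above are:

```cpp
//   a = {a0, a1, a2, a3},  b = {b0, b1, b2, b3}
//   interleave2(a, b)   -> {a0, b0, a1, b1}, {a2, b2, a3, b3}   // svzip1 / svzip2
//   x = {a0, b0, a1, b1},  y = {a2, b2, a3, b3}
//   deinterleave2(x, y) -> {a0, a1, a2, a3}, {b0, b1, b2, b3}   // svuzp1 / svuzp2
//   flip(a)             -> {a3, a2, a1, a0}                     // svrev_*
```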

View File

@ -1157,103 +1157,103 @@ REGISTER_AVX512_DISPATCH(cholesky_stub, &cholesky_kernel)
REGISTER_AVX2_DISPATCH(cholesky_stub, &cholesky_kernel)
REGISTER_VSX_DISPATCH(cholesky_stub, &cholesky_kernel)
REGISTER_ZVECTOR_DISPATCH(cholesky_stub, &cholesky_kernel)
REGISTER_SVE256_DISPATCH(cholesky_stub, &cholesky_kernel)
REGISTER_SVE_DISPATCH(cholesky_stub, &cholesky_kernel)
REGISTER_ARCH_DISPATCH(cholesky_inverse_stub, DEFAULT, &cholesky_inverse_kernel_impl)
REGISTER_AVX512_DISPATCH(cholesky_inverse_stub, &cholesky_inverse_kernel_impl)
REGISTER_AVX2_DISPATCH(cholesky_inverse_stub, &cholesky_inverse_kernel_impl)
REGISTER_VSX_DISPATCH(cholesky_inverse_stub, &cholesky_inverse_kernel_impl)
REGISTER_ZVECTOR_DISPATCH(cholesky_inverse_stub, &cholesky_inverse_kernel_impl)
REGISTER_SVE256_DISPATCH(cholesky_inverse_stub, &cholesky_inverse_kernel_impl)
REGISTER_SVE_DISPATCH(cholesky_inverse_stub, &cholesky_inverse_kernel_impl)
REGISTER_ARCH_DISPATCH(linalg_eig_stub, DEFAULT, &linalg_eig_kernel)
REGISTER_AVX512_DISPATCH(linalg_eig_stub, &linalg_eig_kernel)
REGISTER_AVX2_DISPATCH(linalg_eig_stub, &linalg_eig_kernel)
REGISTER_VSX_DISPATCH(linalg_eig_stub, &linalg_eig_kernel)
REGISTER_ZVECTOR_DISPATCH(linalg_eig_stub, &linalg_eig_kernel)
REGISTER_SVE256_DISPATCH(linalg_eig_stub, &linalg_eig_kernel)
REGISTER_SVE_DISPATCH(linalg_eig_stub, &linalg_eig_kernel)
REGISTER_ARCH_DISPATCH(linalg_eigh_stub, DEFAULT, &linalg_eigh_kernel)
REGISTER_AVX512_DISPATCH(linalg_eigh_stub, &linalg_eigh_kernel)
REGISTER_AVX2_DISPATCH(linalg_eigh_stub, &linalg_eigh_kernel)
REGISTER_VSX_DISPATCH(linalg_eigh_stub, &linalg_eigh_kernel)
REGISTER_ZVECTOR_DISPATCH(linalg_eigh_stub, &linalg_eigh_kernel)
REGISTER_SVE256_DISPATCH(linalg_eigh_stub, &linalg_eigh_kernel)
REGISTER_SVE_DISPATCH(linalg_eigh_stub, &linalg_eigh_kernel)
REGISTER_ARCH_DISPATCH(geqrf_stub, DEFAULT, &geqrf_kernel)
REGISTER_AVX512_DISPATCH(geqrf_stub, &geqrf_kernel)
REGISTER_AVX2_DISPATCH(geqrf_stub, &geqrf_kernel)
REGISTER_VSX_DISPATCH(geqrf_stub, &geqrf_kernel)
REGISTER_ZVECTOR_DISPATCH(geqrf_stub, &geqrf_kernel)
REGISTER_SVE256_DISPATCH(geqrf_stub, &geqrf_kernel)
REGISTER_SVE_DISPATCH(geqrf_stub, &geqrf_kernel)
REGISTER_ARCH_DISPATCH(orgqr_stub, DEFAULT, &orgqr_kernel_impl)
REGISTER_AVX512_DISPATCH(orgqr_stub, &orgqr_kernel_impl)
REGISTER_AVX2_DISPATCH(orgqr_stub, &orgqr_kernel_impl)
REGISTER_VSX_DISPATCH(orgqr_stub, &orgqr_kernel_impl)
REGISTER_ZVECTOR_DISPATCH(orgqr_stub, &orgqr_kernel_impl)
REGISTER_SVE256_DISPATCH(orgqr_stub, &orgqr_kernel_impl)
REGISTER_SVE_DISPATCH(orgqr_stub, &orgqr_kernel_impl)
REGISTER_ARCH_DISPATCH(ormqr_stub, DEFAULT, &ormqr_kernel)
REGISTER_AVX512_DISPATCH(ormqr_stub, &ormqr_kernel)
REGISTER_AVX2_DISPATCH(ormqr_stub, &ormqr_kernel)
REGISTER_VSX_DISPATCH(ormqr_stub, &ormqr_kernel)
REGISTER_ZVECTOR_DISPATCH(ormqr_stub, &ormqr_kernel)
REGISTER_SVE256_DISPATCH(ormqr_stub, &ormqr_kernel)
REGISTER_SVE_DISPATCH(ormqr_stub, &ormqr_kernel)
REGISTER_ARCH_DISPATCH(lstsq_stub, DEFAULT, &lstsq_kernel)
REGISTER_AVX512_DISPATCH(lstsq_stub, &lstsq_kernel)
REGISTER_AVX2_DISPATCH(lstsq_stub, &lstsq_kernel)
REGISTER_VSX_DISPATCH(lstsq_stub, &lstsq_kernel)
REGISTER_ZVECTOR_DISPATCH(lstsq_stub, &lstsq_kernel)
REGISTER_SVE256_DISPATCH(lstsq_stub, &lstsq_kernel)
REGISTER_SVE_DISPATCH(lstsq_stub, &lstsq_kernel)
REGISTER_ARCH_DISPATCH(triangular_solve_stub, DEFAULT, &triangular_solve_kernel)
REGISTER_AVX512_DISPATCH(triangular_solve_stub, &triangular_solve_kernel)
REGISTER_AVX2_DISPATCH(triangular_solve_stub, &triangular_solve_kernel)
REGISTER_VSX_DISPATCH(triangular_solve_stub, &triangular_solve_kernel)
REGISTER_ZVECTOR_DISPATCH(triangular_solve_stub, &triangular_solve_kernel)
REGISTER_SVE256_DISPATCH(triangular_solve_stub, &triangular_solve_kernel)
REGISTER_SVE_DISPATCH(triangular_solve_stub, &triangular_solve_kernel)
REGISTER_ARCH_DISPATCH(lu_factor_stub, DEFAULT, &lu_factor_kernel)
REGISTER_AVX512_DISPATCH(lu_factor_stub, &lu_factor_kernel)
REGISTER_AVX2_DISPATCH(lu_factor_stub, &lu_factor_kernel)
REGISTER_VSX_DISPATCH(lu_factor_stub, &lu_factor_kernel)
REGISTER_ZVECTOR_DISPATCH(lu_factor_stub, &lu_factor_kernel)
REGISTER_SVE256_DISPATCH(lu_factor_stub, &lu_factor_kernel)
REGISTER_SVE_DISPATCH(lu_factor_stub, &lu_factor_kernel)
REGISTER_ARCH_DISPATCH(ldl_factor_stub, DEFAULT, &ldl_factor_kernel)
REGISTER_AVX512_DISPATCH(ldl_factor_stub, &ldl_factor_kernel)
REGISTER_AVX2_DISPATCH(ldl_factor_stub, &ldl_factor_kernel)
REGISTER_VSX_DISPATCH(ldl_factor_stub, &ldl_factor_kernel)
REGISTER_ZVECTOR_DISPATCH(ldl_factor_stub, &ldl_factor_kernel)
REGISTER_SVE256_DISPATCH(ldl_factor_stub, &ldl_factor_kernel)
REGISTER_SVE_DISPATCH(ldl_factor_stub, &ldl_factor_kernel)
REGISTER_ARCH_DISPATCH(ldl_solve_stub, DEFAULT, &ldl_solve_kernel)
REGISTER_AVX512_DISPATCH(ldl_solve_stub, &ldl_solve_kernel)
REGISTER_AVX2_DISPATCH(ldl_solve_stub, &ldl_solve_kernel)
REGISTER_VSX_DISPATCH(ldl_solve_stub, &ldl_solve_kernel)
REGISTER_ZVECTOR_DISPATCH(ldl_solve_stub, &ldl_solve_kernel)
REGISTER_SVE256_DISPATCH(ldl_solve_stub, &ldl_solve_kernel)
REGISTER_SVE_DISPATCH(ldl_solve_stub, &ldl_solve_kernel)
REGISTER_ARCH_DISPATCH(lu_solve_stub, DEFAULT, &lu_solve_kernel)
REGISTER_AVX512_DISPATCH(lu_solve_stub, &lu_solve_kernel)
REGISTER_AVX2_DISPATCH(lu_solve_stub, &lu_solve_kernel)
REGISTER_VSX_DISPATCH(lu_solve_stub, &lu_solve_kernel)
REGISTER_ZVECTOR_DISPATCH(lu_solve_stub, &lu_solve_kernel)
REGISTER_SVE256_DISPATCH(lu_solve_stub, &lu_solve_kernel)
REGISTER_SVE_DISPATCH(lu_solve_stub, &lu_solve_kernel)
REGISTER_ARCH_DISPATCH(svd_stub, DEFAULT, &svd_kernel)
REGISTER_AVX512_DISPATCH(svd_stub, &svd_kernel)
REGISTER_AVX2_DISPATCH(svd_stub, &svd_kernel)
REGISTER_VSX_DISPATCH(svd_stub, &svd_kernel)
REGISTER_ZVECTOR_DISPATCH(svd_stub, &svd_kernel)
REGISTER_SVE256_DISPATCH(svd_stub, &svd_kernel)
REGISTER_SVE_DISPATCH(svd_stub, &svd_kernel)
REGISTER_ARCH_DISPATCH(unpack_pivots_stub, DEFAULT, &unpack_pivots_cpu_kernel)
REGISTER_AVX512_DISPATCH(unpack_pivots_stub, &unpack_pivots_cpu_kernel)
REGISTER_AVX2_DISPATCH(unpack_pivots_stub, &unpack_pivots_cpu_kernel)
REGISTER_VSX_DISPATCH(unpack_pivots_stub, &unpack_pivots_cpu_kernel)
REGISTER_ZVECTOR_DISPATCH(unpack_pivots_stub, &unpack_pivots_cpu_kernel)
REGISTER_SVE256_DISPATCH(unpack_pivots_stub, &unpack_pivots_cpu_kernel)
REGISTER_SVE_DISPATCH(unpack_pivots_stub, &unpack_pivots_cpu_kernel)
} // namespace at::native

View File

@ -406,11 +406,23 @@ struct ConvParams {
// cudnn and miopen are guaranteed not to be on mobile, and T102591915 / T110194934 suggest
// that maybe the compiledWithCuDNN() check sometimes segfaults (though I can't imagine how)
#if !defined(C10_MOBILE)
if (!detail::getCUDAHooks().compiledWithCuDNN()) {
if (!detail::getCUDAHooks().compiledWithCuDNN() || !input.is_cuda() || !cudnn_enabled) {
return false;
}
static long cudnn_version = detail::getCUDAHooks().versionCuDNN();
// broken on cuDNN 9.8
if (cudnn_version >= 90800) {
if (cudnn_conv_suggest_memory_format(input, weight) == at::MemoryFormat::Contiguous &&
(input.scalar_type() == at::kBFloat16 || input.scalar_type() == at::kHalf) &&
weight.dim() == 5) {
for (int i = 2; i < weight.dim(); i++) {
if (weight.size(i) != 1) {
return false;
}
}
}
}
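// Illustration of the cuDNN 9.8 workaround above (shapes illustrative): a
// contiguous half/bfloat16 convolution with a 5-D weight such as
// [C_out, C_in, 3, 3, 3] is routed away from cuDNN on cuDNN >= 9.8, while a
// pointwise [C_out, C_in, 1, 1, 1] kernel keeps the cuDNN path eligible.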
if (needs_64bit_indexing_no_split(input, weight)) {
static long cudnn_version = detail::getCUDAHooks().versionCuDNN();
if (!(cudnn_version >= 90300 && at::native::cudnnv8_enabled_check_debug())) {
TORCH_WARN_ONCE("cuDNN cannot be used for large non-batch-splittable convolutions"
" if the V8 API is not enabled or before cuDNN version 9.3+."
@ -418,9 +430,6 @@ struct ConvParams {
return false;
}
}
if (!input.is_cuda() || !cudnn_enabled) {
return false;
}
if (input.scalar_type() == at::kBFloat16 || weight.scalar_type() == at::kBFloat16) {
if (!(detail::getCUDAHooks().supportsBFloat16ConvolutionWithCuDNNv8() && at::native::cudnnv8_enabled_check_debug())) {
return false;
@ -439,16 +448,19 @@ struct ConvParams {
// Use cudnn for FP16 depthwise convolutions
bool use_cudnn_depthwise(const at::Tensor& input, const at::Tensor& weight) const {
if (!detail::getCUDAHooks().compiledWithCuDNN()) {
if (!cudnn_enabled || !detail::getCUDAHooks().compiledWithCuDNN() || !input.is_cuda()) {
return false;
}
if (cudnn_conv_suggest_memory_format(input, weight) != at::MemoryFormat::Contiguous && use_cudnn(input, weight)) {
// always use cudnn_depthwise for channels_last format
return true;
}
// native kernel doesn't support 64-bit non-splittable case
if (cudnn_enabled && !(canUse32BitIndexMath(input) && canUse32BitIndexMath(weight))) {
if (!(canUse32BitIndexMath(input) && canUse32BitIndexMath(weight))) {
static long cudnn_version = detail::getCUDAHooks().compiledWithCuDNN() ? detail::getCUDAHooks().versionCuDNN() : -1;
// TODO(eqy): remove this once cuDNN fixes 64-bit depthwise support, first broken in 9.11x
if (cudnn_conv_suggest_memory_format(input, weight) != at::MemoryFormat::Contiguous) {
if (cudnn_version < 0 || cudnn_version > 91000) {
return false;
}
}
if (!(cudnn_version >= 90300 && at::native::cudnnv8_enabled_check_debug())) {
TORCH_WARN_ONCE("cuDNN cannot be used for large non-batch-splittable convolutions"
" if the V8 API is not enabled or before cuDNN version 9.3+."
@ -458,6 +470,10 @@ struct ConvParams {
return true;
}
}
if (cudnn_conv_suggest_memory_format(input, weight) != at::MemoryFormat::Contiguous) {
// always use cudnn_depthwise for channels_last format
return true;
}
if (detail::getCUDAHooks().supportsDepthwiseConvolutionWithCuDNN()) {
bool kernel_cond = (use_cudnn(input, weight) &&
input.scalar_type() == kHalf && // only for FP16

View File

@ -39,19 +39,21 @@ static CPUCapability compute_cpu_capability() {
}
#elif defined(HAVE_SVE_CPU_DEFINITION)
int sve_vl = cpuinfo_get_max_arm_sve_length(); //Returns maximum SVE VL supported by your HW.
#ifdef HAVE_SVE256_CPU_DEFINITION
if (envar == "sve256") {
if (envar == "sve") {
// Select SVE capability based on the maximum SVE VL supported by the HW.
if (sve_vl == 256) {
#ifdef HAVE_ARM_BF16_CPU_DEFINITION
if (cpuinfo_has_arm_bf16()) {
return CPUCapability::SVE256;
}
#endif
} else if (sve_vl == 128) {
if (cpuinfo_has_arm_bf16()) {
return CPUCapability::SVE128;
}
} else {
TORCH_WARN("SVE capability not available on hardware. Falling back to DEFAULT");
return CPUCapability::DEFAULT;
}
TORCH_WARN("SVE256 capability not available on hardware. Falling back to DEFAULT");
return CPUCapability::DEFAULT;
}
#endif
#else
#ifdef HAVE_AVX512_CPU_DEFINITION
if (envar == "avx512") {
@ -113,6 +115,11 @@ static CPUCapability compute_cpu_capability() {
#endif
}
#endif
#ifdef HAVE_SVE128_CPU_DEFINITION
if (sve_vl == 128) { // Check for SVE128
return CPUCapability::SVE128;
}
#endif
// Return the default CPU capability.
return CPUCapability::DEFAULT;
}
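// Behaviour sketch of the selection above: with the capability override env
// var set to "sve" (ATEN_CPU_CAPABILITY on current builds), SVE256 or SVE128
// is chosen from the hardware's maximum SVE vector length (both also require
// BF16 support); any other VL warns and falls back to DEFAULT. Without the
// override, a probed VL of 128 now auto-selects SVE128 further below.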
@ -147,6 +154,9 @@ DispatchResult DispatchStubImpl::try_get_call_ptr(
#ifdef HAVE_SVE256_CPU_DEFINITION
, void *SVE256
#endif
#ifdef HAVE_SVE128_CPU_DEFINITION
, void *SVE128
#endif
) {
constexpr auto supported_devices = c10::array_of<c10::DeviceType>(
c10::DeviceType::CPU,
@ -184,6 +194,9 @@ DispatchResult DispatchStubImpl::try_get_call_ptr(
#endif
#ifdef HAVE_SVE256_CPU_DEFINITION
, SVE256
#endif
#ifdef HAVE_SVE128_CPU_DEFINITION
, SVE128
#endif
);
if (!std::holds_alternative<ErrorType>(result)) {
@ -242,6 +255,9 @@ void* DispatchStubImpl::get_call_ptr(
#ifdef HAVE_SVE256_CPU_DEFINITION
, void *SVE256
#endif
#ifdef HAVE_SVE128_CPU_DEFINITION
, void *SVE128
#endif
) {
auto result = try_get_call_ptr(
@ -266,6 +282,10 @@ void* DispatchStubImpl::get_call_ptr(
#ifdef HAVE_SVE256_CPU_DEFINITION
,
SVE256
#endif
#ifdef HAVE_SVE128_CPU_DEFINITION
,
SVE128
#endif
);
if (std::holds_alternative<ErrorType>(result)) {
@ -300,6 +320,9 @@ DispatchResult DispatchStubImpl::try_choose_cpu_impl(
#endif
#ifdef HAVE_SVE256_CPU_DEFINITION
, void *SVE256
#endif
#ifdef HAVE_SVE128_CPU_DEFINITION
, void *SVE128
#endif
){
@ -342,6 +365,16 @@ DispatchResult DispatchStubImpl::try_choose_cpu_impl(
return DispatchResult(SVE256);
}
}
#endif
#ifdef HAVE_SVE128_CPU_DEFINITION
if (capability >= static_cast<int>(CPUCapability::SVE128)) {
if (C10_UNLIKELY(!SVE128)) {
// dispatch to DEFAULT, since the SVE kernel is missing
return DEFAULT != nullptr ? DispatchResult(DEFAULT) : ErrorType::MissingDeviceKernel;
} else {
return DispatchResult(SVE128);
}
}
#endif
return DEFAULT != nullptr ? DispatchResult(DEFAULT) : ErrorType::MissingDeviceKernel;
}
@ -363,6 +396,9 @@ void* DispatchStubImpl::choose_cpu_impl(
#ifdef HAVE_SVE256_CPU_DEFINITION
, void *SVE256
#endif
#ifdef HAVE_SVE128_CPU_DEFINITION
, void *SVE128
#endif
) {
auto capability = static_cast<int>(get_cpu_capability());
(void)capability;
@ -408,6 +444,17 @@ void* DispatchStubImpl::choose_cpu_impl(
return SVE256;
}
}
#endif
#ifdef HAVE_SVE128_CPU_DEFINITION
if (capability >= static_cast<int>(CPUCapability::SVE128)) {
if (C10_UNLIKELY(!SVE128)) {
// dispatch to DEFAULT, since the SVE kernel is missing
TORCH_INTERNAL_ASSERT(DEFAULT, "DispatchStub: missing default kernel");
return DEFAULT;
} else {
return SVE128;
}
}
#endif
TORCH_INTERNAL_ASSERT(DEFAULT, "DispatchStub: missing default kernel");
return DEFAULT;

View File

@ -64,8 +64,9 @@ enum class CPUCapability {
VSX = 1,
#elif defined(HAVE_ZVECTOR_CPU_DEFINITION)
ZVECTOR = 1,
#elif defined(HAVE_SVE256_CPU_DEFINITION) && defined(HAVE_ARM_BF16_CPU_DEFINITION)
#elif defined(HAVE_SVE_CPU_DEFINITION) && defined(HAVE_ARM_BF16_CPU_DEFINITION)
SVE256 = 1,
SVE128 = 2,
#else
AVX2 = 1,
AVX512 = 2,
@ -117,6 +118,9 @@ struct TORCH_API DispatchStubImpl {
#endif
#ifdef HAVE_SVE256_CPU_DEFINITION
, void *SVE256
#endif
#ifdef HAVE_SVE128_CPU_DEFINITION
, void *SVE128
#endif
);
@ -138,6 +142,9 @@ struct TORCH_API DispatchStubImpl {
#endif
#ifdef HAVE_SVE256_CPU_DEFINITION
, void *SVE256
#endif
#ifdef HAVE_SVE128_CPU_DEFINITION
, void *SVE128
#endif
);
@ -159,6 +166,9 @@ struct TORCH_API DispatchStubImpl {
#endif
#ifdef HAVE_SVE256_CPU_DEFINITION
, void *SVE256
#endif
#ifdef HAVE_SVE128_CPU_DEFINITION
, void *SVE128
#endif
);
@ -183,6 +193,9 @@ struct TORCH_API DispatchStubImpl {
#endif
#ifdef HAVE_SVE256_CPU_DEFINITION
, void *SVE256
#endif
#ifdef HAVE_SVE128_CPU_DEFINITION
, void *SVE128
#endif
);
@ -240,6 +253,9 @@ private:
#endif
#ifdef HAVE_SVE256_CPU_DEFINITION
, reinterpret_cast<void*>(SVE256)
#endif
#ifdef HAVE_SVE128_CPU_DEFINITION
, reinterpret_cast<void*>(SVE128)
#endif
)
);
@ -301,6 +317,9 @@ public:
#endif
#ifdef HAVE_SVE256_CPU_DEFINITION
, reinterpret_cast<void*>(SVE256)
#endif
#ifdef HAVE_SVE128_CPU_DEFINITION
, reinterpret_cast<void*>(SVE128)
#endif
);
if (std::holds_alternative<ErrorType>(result)){
@ -325,6 +344,9 @@ public:
#ifdef HAVE_SVE256_CPU_DEFINITION
static TORCH_API FnPtr SVE256;
#endif
#ifdef HAVE_SVE128_CPU_DEFINITION
static TORCH_API FnPtr SVE128;
#endif
private:
DispatchStubImpl impl;
};
@ -432,6 +454,12 @@ struct RegisterPRIVATEUSE1Dispatch {
#define REGISTER_SVE256_DISPATCH(name, fn)
#endif
#ifdef HAVE_SVE128_CPU_DEFINITION
#define REGISTER_SVE128_DISPATCH(name, fn) REGISTER_ARCH_DISPATCH(name, SVE128, fn)
#else
#define REGISTER_SVE128_DISPATCH(name, fn)
#endif
// Macro to register the same kernel for all CPU arch types. This is useful
// if a kernel does not benefit from being recompiled across different arch types.
#define REGISTER_ALL_CPU_DISPATCH(name, fn) \
@ -440,6 +468,11 @@ struct RegisterPRIVATEUSE1Dispatch {
REGISTER_AVX2_DISPATCH(name, fn) \
REGISTER_VSX_DISPATCH(name, fn) \
REGISTER_ZVECTOR_DISPATCH(name, fn) \
REGISTER_SVE256_DISPATCH(name, fn) \
REGISTER_SVE128_DISPATCH(name, fn)
#define REGISTER_SVE_DISPATCH(name, fn) \
REGISTER_SVE128_DISPATCH(name, fn) \
REGISTER_SVE256_DISPATCH(name, fn)
#define REGISTER_NO_CPU_DISPATCH(name) \
@ -482,6 +515,7 @@ struct RegisterPRIVATEUSE1Dispatch {
// REGISTER_DISPATCH now dispatches an AVX512 kernel to nullptr but registers other dispatches.
// ALSO_REGISTER_AVX512_DISPATCH should be used for ensuring AVX512 dispatch, among others.
// ALSO_REGISTER_SVE256_DISPATCH should be used for ensuring SVE256 dispatch, among others.
// ALSO_REGISTER_SVE128_DISPATCH should be used for ensuring SVE128 dispatch, among others.
#ifdef CPU_CAPABILITY_AVX512
#define REGISTER_DISPATCH(name, fn) REGISTER_ARCH_DISPATCH(name, CPU_CAPABILITY, ((void*)(fn) ? nullptr : nullptr))
#else
@ -489,6 +523,7 @@ struct RegisterPRIVATEUSE1Dispatch {
#endif
#define ALSO_REGISTER_AVX512_DISPATCH(name, fn) REGISTER_ARCH_DISPATCH(name, CPU_CAPABILITY, fn)
#define ALSO_REGISTER_SVE256_DISPATCH(name, fn) REGISTER_ARCH_DISPATCH(name, CPU_CAPABILITY, fn)
#define ALSO_REGISTER_SVE128_DISPATCH(name, fn) REGISTER_ARCH_DISPATCH(name, CPU_CAPABILITY, fn)
#endif
} // namespace at::native
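
Expansion sketch for the new convenience macro (each inner macro is a no-op when the corresponding SVE variant is not compiled in):

```cpp
// REGISTER_SVE_DISPATCH(cholesky_stub, &cholesky_kernel)
//   -> REGISTER_ARCH_DISPATCH(cholesky_stub, SVE128, &cholesky_kernel)
//      REGISTER_ARCH_DISPATCH(cholesky_stub, SVE256, &cholesky_kernel)
```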

View File

@ -466,7 +466,7 @@ REGISTER_AVX2_DISPATCH(_segment_reduce_lengths_stub, &_segment_reduce_lengths_cp
REGISTER_AVX512_DISPATCH(_segment_reduce_lengths_stub, &_segment_reduce_lengths_cpu_kernel)
REGISTER_VSX_DISPATCH(_segment_reduce_lengths_stub, &_segment_reduce_lengths_cpu_kernel)
REGISTER_ZVECTOR_DISPATCH(_segment_reduce_lengths_stub, &_segment_reduce_lengths_cpu_kernel)
REGISTER_SVE256_DISPATCH(_segment_reduce_lengths_stub, &_segment_reduce_lengths_cpu_kernel)
REGISTER_SVE_DISPATCH(_segment_reduce_lengths_stub, &_segment_reduce_lengths_cpu_kernel)
// offsets dispatches
REGISTER_ARCH_DISPATCH(
@ -477,7 +477,7 @@ REGISTER_AVX2_DISPATCH(_segment_reduce_offsets_stub, &_segment_reduce_offsets_cp
REGISTER_AVX512_DISPATCH(_segment_reduce_offsets_stub, &_segment_reduce_offsets_cpu_kernel)
REGISTER_VSX_DISPATCH(_segment_reduce_offsets_stub, &_segment_reduce_offsets_cpu_kernel)
REGISTER_ZVECTOR_DISPATCH(_segment_reduce_offsets_stub, &_segment_reduce_offsets_cpu_kernel)
REGISTER_SVE256_DISPATCH(_segment_reduce_offsets_stub, &_segment_reduce_offsets_cpu_kernel)
REGISTER_SVE_DISPATCH(_segment_reduce_offsets_stub, &_segment_reduce_offsets_cpu_kernel)
// Currently some computation is being duplicated across forward and backward.
// TODO: Cache indices in forward pass to reuse in backward
@ -548,7 +548,7 @@ REGISTER_VSX_DISPATCH(
REGISTER_ZVECTOR_DISPATCH(
_segment_reduce_lengths_backward_stub,
&_segment_reduce_cpu_lengths_backward_kernel)
REGISTER_SVE256_DISPATCH(
REGISTER_SVE_DISPATCH(
_segment_reduce_lengths_backward_stub,
&_segment_reduce_cpu_lengths_backward_kernel)
@ -568,7 +568,7 @@ REGISTER_VSX_DISPATCH(
REGISTER_ZVECTOR_DISPATCH(
_segment_reduce_offsets_backward_stub,
&_segment_reduce_cpu_offsets_backward_kernel)
REGISTER_SVE256_DISPATCH(
REGISTER_SVE_DISPATCH(
_segment_reduce_offsets_backward_stub,
&_segment_reduce_cpu_offsets_backward_kernel)

View File

@ -212,7 +212,7 @@ std::pair<vec::Vectorized<float>, vec::Vectorized<float>> fmadd(
const vec::Vectorized<c10::Half>& b,
const vec::Vectorized<float>& acc_low,
const vec::Vectorized<float>& acc_high) {
#if defined(__ARM_FEATURE_FP16_FML) && !defined(CPU_CAPABILITY_SVE)
#if defined(__aarch64__) && ((defined(__ARM_FEATURE_FP16_FML) && !defined(__ARM_FEATURE_SVE)) || (defined(CPU_CAPABILITY_SVE128)))
return std::make_pair(vfmlalq_low_f16(acc_low, a, b), vfmlalq_high_f16(acc_high, a, b));
#else
const auto [a_float_low, a_float_high] = convert_half_float(a);
@ -233,7 +233,7 @@ std::pair<vec::Vectorized<float>, vec::Vectorized<float>> fmadd(
// Return a + b_low * c_low + b_high * c_high
vec::Vectorized<float> fmadd(vec::Vectorized<float> a, vec::Vectorized<Half> b, vec::Vectorized<Half> c) {
#if defined(__aarch64__) && defined(__ARM_FEATURE_FP16_FML) && !defined(__ARM_FEATURE_SVE)
#if defined(__aarch64__) && ((defined(__ARM_FEATURE_FP16_FML) && !defined(__ARM_FEATURE_SVE)) || (defined(CPU_CAPABILITY_SVE128)))
// NOTE: this instruction is an optional instruction in ARM v8.2 and
// v8.3, but mandatory in v8.4 per
// https://developer.arm.com/documentation/ddi0596/2021-03/SIMD-FP-Instructions/FMLAL--FMLAL2--vector---Floating-point-fused-Multiply-Add-Long-to-accumulator--vector--?lang=en

View File

@ -5,6 +5,7 @@
#include <array>
#include <type_traits>
#include <ATen/core/TensorBase.h>
#include <ATen/ceil_div.h>
#include <ATen/Dispatch.h>
#include <ATen/Dispatch_v2.h>
#include <ATen/cuda/CUDAContext.h>
@ -83,11 +84,17 @@ void gpu_index_kernel(TensorIteratorBase& iter, const IntArrayRef index_size, co
auto ind_dim_size = index_size[0];
auto inp_stride_bytes = index_stride[0];
auto out_stride_bytes = iter.strides(0)[1];
if (iter.numel() == 0) return;
at::native::vectorized_gather_kernel_launch<alignment, int64_t>(out_ptr, in_ptr, (int64_t*)iter.data_ptr(2), num_ind,
slice_size, ind_dim_size, inp_stride_bytes, out_stride_bytes, /*allow_neg_indices*/true);
return;
}
// avoid grid overflow in the fast kernel
const int64_t vec_chunks = ceil_div(slice_size, alignment);
const int64_t blocks_per_slice_upper = ceil_div(vec_chunks, (int64_t)launch_size_nd);
const int max_grid_y = at::cuda::getCurrentDeviceProperties()->maxGridSize[1];
// if it's an eligible grid we use the fast path, otherwise default to slower path
if (blocks_per_slice_upper <= max_grid_y) {
at::native::vectorized_gather_kernel_launch<alignment, int64_t>(out_ptr, in_ptr, (int64_t*)iter.data_ptr(2), num_ind,
slice_size, ind_dim_size, inp_stride_bytes, out_stride_bytes, /*allow_neg_indices*/true);
return;
}
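// Worked example of the guard above (assuming launch_size_nd = 128 and
// maxGridSize[1] = 65535): for slice_size = 1'000'000 and alignment = 16,
//   vec_chunks             = ceil_div(1'000'000, 16) = 62'500
//   blocks_per_slice_upper = ceil_div(62'500, 128)   = 489  <= 65'535 -> fast path
// A slice of ~2^31 elements instead needs ~1'048'576 blocks and falls back to
// the generic indexing path rather than overflowing gridDim.y.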
}
}
auto sizes = std::array<int64_t, MAX_DIMS>{};

View File

@ -1238,7 +1238,7 @@ Tensor _cholesky_solve_helper_cuda_magma(const Tensor& self, const Tensor& A, bo
// Todo: cusolverDn<T>potrsBatched only supports nrhs == 1 and does not have good performance.
// Batched cholesky_solve is dispatched to magma.
Tensor _cholesky_solve_helper_cuda(const Tensor& self, const Tensor& A, bool upper) {
#if defined(USE_LINALG_SOLVER) && !defined(USE_ROCM)
#if defined(USE_LINALG_SOLVER)
auto preferred_backend = at::globalContext().linalgPreferredBackend();
switch (preferred_backend) {
case at::LinalgBackend::Cusolver:
@ -1352,7 +1352,7 @@ void cholesky_helper_magma(const Tensor& input, bool upper, const Tensor& info)
}
static void cholesky_kernel(const Tensor& input, const Tensor& info, bool upper) {
#if defined(USE_LINALG_SOLVER) && !defined(USE_ROCM)
#if defined(USE_LINALG_SOLVER)
auto preferred_backend = at::globalContext().linalgPreferredBackend();
switch (preferred_backend) {
case at::LinalgBackend::Cusolver:
@ -2709,7 +2709,7 @@ void linalg_lstsq_gels(const Tensor& A, const Tensor& B, const Tensor& /*infos*/
}
void gels_looped(const Tensor& a, Tensor& b, Tensor& infos) {
#if defined(USE_LINALG_SOLVER) && !defined(USE_ROCM)
#if defined(USE_LINALG_SOLVER)
auto preferred_backend = at::globalContext().linalgPreferredBackend();
switch (preferred_backend) {
case at::LinalgBackend::Magma:
@ -2733,7 +2733,7 @@ void lstsq_kernel(const Tensor& a, Tensor& b, Tensor& /*rank*/, Tensor& /*singul
// first handle the underdetermined case (m < n)
// this case is not supported by MAGMA or cuBLAS
if (m < n) {
#if defined(USE_LINALG_SOLVER) && !defined(USE_ROCM)
#if defined(USE_LINALG_SOLVER)
linalg_lstsq_gels(a, b, infos);
#else
TORCH_CHECK(

View File

@ -165,7 +165,7 @@ REGISTER_AVX2_DISPATCH(fft_fill_with_conjugate_symmetry_stub, &_fft_fill_with_co
REGISTER_AVX512_DISPATCH(fft_fill_with_conjugate_symmetry_stub, &_fft_fill_with_conjugate_symmetry_cpu_)
REGISTER_ZVECTOR_DISPATCH(fft_fill_with_conjugate_symmetry_stub, &_fft_fill_with_conjugate_symmetry_cpu_)
REGISTER_VSX_DISPATCH(fft_fill_with_conjugate_symmetry_stub, &_fft_fill_with_conjugate_symmetry_cpu_)
REGISTER_SVE256_DISPATCH(fft_fill_with_conjugate_symmetry_stub, &_fft_fill_with_conjugate_symmetry_cpu_)
REGISTER_SVE_DISPATCH(fft_fill_with_conjugate_symmetry_stub, &_fft_fill_with_conjugate_symmetry_cpu_)
// _out variants can be shared between PocketFFT and MKL
Tensor& _fft_r2c_mkl_out(const Tensor& self, IntArrayRef dim, int64_t normalization,

View File

@ -14,8 +14,8 @@ template <typename T, int D, int V = D>
device T* out [[buffer(3)]],
const constant uint& gqa_factor [[buffer(4)]],
const constant uint& N [[buffer(5)]],
const constant uint2& k_head_seq_stride [[buffer(6)]],
const constant uint2& v_head_seq_stride [[buffer(7)]],
const constant uint3& qkv_head_strides [[buffer(6)]],
const constant uint3& qkv_seq_strides [[buffer(7)]],
const constant float& scale [[buffer(8)]],
const device bool* mask [[buffer(9)]],
const constant uint3& mask_strides [[buffer(10)]],
@ -28,10 +28,12 @@ template <typename T, int D, int V = D>
constexpr uint BD = 32;
constexpr uint qk_per_thread = D / BD;
constexpr uint v_per_thread = V / BD;
const uint k_head_stride = k_head_seq_stride.x;
const uint k_seq_stride = k_head_seq_stride.y;
const uint v_head_stride = v_head_seq_stride.x;
const uint v_seq_stride = v_head_seq_stride.y;
const uint q_head_stride = qkv_head_strides.x;
const uint q_seq_stride = qkv_seq_strides.x;
const uint k_head_stride = qkv_head_strides.y;
const uint k_seq_stride = qkv_seq_strides.y;
const uint v_head_stride = qkv_head_strides.z;
const uint v_seq_stride = qkv_seq_strides.z;
const uint mask_head_stride = mask_strides.x;
const uint mask_kv_seq_stride = mask_strides.y;
const uint mask_q_seq_stride = mask_strides.z;
@ -54,9 +56,9 @@ template <typename T, int D, int V = D>
const int kv_head_idx = head_idx / gqa_factor;
const int Q = tpg.y;
const int group_offset = head_idx * Q + q_seq_idx;
const int q_offset = group_offset;
const int o_offset = group_offset;
queries += q_offset * D + simd_lid * qk_per_thread;
queries += head_idx * q_head_stride + q_seq_idx * q_seq_stride +
simd_lid * qk_per_thread;
keys += kv_head_idx * k_head_stride + simd_gid * k_seq_stride +
simd_lid * qk_per_thread;
values += kv_head_idx * v_head_stride + simd_gid * v_seq_stride +
@ -156,8 +158,8 @@ template <typename T, int D, int V = D>
device float* maxs [[buffer(5)]],
const constant uint& gqa_factor [[buffer(6)]],
const constant uint& N [[buffer(7)]],
const constant uint2& k_head_seq_stride [[buffer(8)]],
const constant uint2& v_head_seq_stride [[buffer(9)]],
const constant uint3& qkv_head_strides [[buffer(8)]],
const constant uint3& qkv_seq_strides [[buffer(9)]],
const constant float& scale [[buffer(10)]],
const device bool* mask [[buffer(11)]],
const constant uint3& mask_strides [[buffer(12)]],
@ -170,10 +172,12 @@ template <typename T, int D, int V = D>
constexpr int BD = 32;
constexpr int qk_per_thread = D / BD;
constexpr int v_per_thread = V / BD;
const int k_head_stride = k_head_seq_stride.x;
const int k_seq_stride = k_head_seq_stride.y;
const int v_head_stride = v_head_seq_stride.x;
const int v_seq_stride = v_head_seq_stride.y;
const int q_head_stride = qkv_head_strides.x;
const int q_seq_stride = qkv_seq_strides.x;
const int k_head_stride = qkv_head_strides.y;
const int k_seq_stride = qkv_seq_strides.y;
const int v_head_stride = qkv_head_strides.z;
const int v_seq_stride = qkv_seq_strides.z;
const int mask_kv_seq_stride = mask_strides.x;
const int mask_q_seq_stride = mask_strides.y;
const int mask_head_stride = mask_strides.z;
@ -196,10 +200,10 @@ template <typename T, int D, int V = D>
const int head_idx = tid.x;
const int q_seq_idx = tid.y;
const int o_offset = head_idx * tpg.y + q_seq_idx;
const int q_offset = o_offset;
const int kv_head_idx = head_idx / gqa_factor;
queries += q_offset * D + simd_lid * qk_per_thread;
queries += head_idx * q_head_stride + q_seq_idx * q_seq_stride +
simd_lid * qk_per_thread;
keys += kv_head_idx * k_head_stride +
(block_idx * BN + simd_gid) * k_seq_stride + simd_lid * qk_per_thread;
values += kv_head_idx * v_head_stride +
@ -520,25 +524,25 @@ kernel void attention(
}
}
#define INSTANTIATE_SDPA_VECTOR(DTYPE, QK_DIM, VALUE_DIM) \
template [[host_name("sdpa_vector_" #DTYPE "_" #QK_DIM \
"_" #VALUE_DIM)]] kernel void \
sdpa_vector<DTYPE, QK_DIM, VALUE_DIM>( \
const device DTYPE* queries [[buffer(0)]], \
const device DTYPE* keys [[buffer(1)]], \
const device DTYPE* values [[buffer(2)]], \
device DTYPE* out [[buffer(3)]], \
const constant uint& gqa_factor [[buffer(4)]], \
const constant uint& N [[buffer(5)]], \
const constant uint2& k_head_seq_stride [[buffer(6)]], \
const constant uint2& v_head_seq_stride [[buffer(7)]], \
const constant float& scale [[buffer(8)]], \
const device bool* mask [[buffer(9)]], \
const constant uint3& mask_strides [[buffer(10)]], \
const constant bool& has_mask [[buffer(11)]], \
uint3 tid [[threadgroup_position_in_grid]], \
uint3 tpg [[threadgroups_per_grid]], \
uint simd_gid [[simdgroup_index_in_threadgroup]], \
#define INSTANTIATE_SDPA_VECTOR(DTYPE, QK_DIM, VALUE_DIM) \
template [[host_name("sdpa_vector_" #DTYPE "_" #QK_DIM \
"_" #VALUE_DIM)]] kernel void \
sdpa_vector<DTYPE, QK_DIM, VALUE_DIM>( \
const device DTYPE* queries [[buffer(0)]], \
const device DTYPE* keys [[buffer(1)]], \
const device DTYPE* values [[buffer(2)]], \
device DTYPE* out [[buffer(3)]], \
const constant uint& gqa_factor [[buffer(4)]], \
const constant uint& N [[buffer(5)]], \
const constant uint3& qkv_head_strides [[buffer(6)]], \
const constant uint3& qkv_seq_strides [[buffer(7)]], \
const constant float& scale [[buffer(8)]], \
const device bool* mask [[buffer(9)]], \
const constant uint3& mask_strides [[buffer(10)]], \
const constant bool& has_mask [[buffer(11)]], \
uint3 tid [[threadgroup_position_in_grid]], \
uint3 tpg [[threadgroups_per_grid]], \
uint simd_gid [[simdgroup_index_in_threadgroup]], \
uint simd_lid [[thread_index_in_simdgroup]]);
#define INSTANTIATE_SDPA_VECTOR_2PASS_1(DTYPE, QK_DIM, VALUE_DIM) \
@ -553,8 +557,8 @@ kernel void attention(
device float* maxs [[buffer(5)]], \
const constant uint& gqa_factor [[buffer(6)]], \
const constant uint& N [[buffer(7)]], \
const constant uint2& k_head_seq_stride [[buffer(8)]], \
const constant uint2& v_head_seq_stride [[buffer(9)]], \
const constant uint3& qkv_head_strides [[buffer(8)]], \
const constant uint3& qkv_seq_strides [[buffer(9)]], \
const constant float& scale [[buffer(10)]], \
const device bool* mask [[buffer(11)]], \
const constant uint3& mask_strides [[buffer(12)]], \

View File

@ -223,9 +223,6 @@ void grid_sampler_single_element(
auto input_size = input_sizes[input_dim];
auto coord = static_cast<opmath_t<T>>(coords[coord_dim]);
// Interpret nan as -1
coord = isnan(coord) ? -1 : coord;
if (!align_corners) {
// Map unaligned grid space to aligned grid space
auto corner_alignment_factor = static_cast<opmath_t<T>>(input_size) /

View File

@ -182,6 +182,8 @@ static std::tuple<Tensor, Tensor> sdpa_vector_fast_mps(const Tensor& q_,
uint maxSeqLength = k_.size(2);
uint N = k_.size(2);
uint B = q_.size(0) * q_.size(1);
uint q_head_stride = q_.stride(1);
uint q_seq_stride = q_.stride(2);
uint k_head_stride = k_.stride(1);
uint k_seq_stride = k_.stride(2);
uint v_head_stride = v_.stride(1);
@ -209,8 +211,8 @@ static std::tuple<Tensor, Tensor> sdpa_vector_fast_mps(const Tensor& q_,
out,
1,
N,
std::array<uint32_t, 2>{k_head_stride, k_seq_stride},
std::array<uint32_t, 2>{v_head_stride, v_seq_stride},
std::array<uint32_t, 3>{q_head_stride, k_head_stride, v_head_stride},
std::array<uint32_t, 3>{q_seq_stride, k_seq_stride, v_seq_stride},
scale_factor);
if (has_mask) {
@ -257,6 +259,8 @@ static std::tuple<Tensor, Tensor> sdpa_vector_2pass_mps(const Tensor& q_,
uint B = batchSize * num_heads;
uint gqa_factor = q_.size(1) / k_.size(1);
uint q_head_stride = q_.stride(1);
uint q_seq_stride = q_.stride(2);
uint k_head_stride = k_.stride(1);
uint k_seq_stride = k_.stride(2);
uint v_head_stride = v_.stride(1);
@ -294,8 +298,8 @@ static std::tuple<Tensor, Tensor> sdpa_vector_2pass_mps(const Tensor& q_,
maxs,
gqa_factor,
N,
std::array<uint32_t, 2>{k_head_stride, k_seq_stride},
std::array<uint32_t, 2>{v_head_stride, v_seq_stride},
std::array<uint32_t, 3>{q_head_stride, k_head_stride, v_head_stride},
std::array<uint32_t, 3>{q_seq_stride, k_seq_stride, v_seq_stride},
scale_factor);
if (has_mask) {

View File

@ -9,11 +9,22 @@
#else
#include <ATen/ops/_unique2.h>
#include <ATen/ops/_unique2_native.h>
#include <ATen/ops/arange.h>
#include <ATen/ops/argsort.h>
#include <ATen/ops/cat.h>
#include <ATen/ops/cumsum.h>
#include <ATen/ops/full.h>
#include <ATen/ops/masked_select.h>
#include <ATen/ops/nonzero.h>
#include <ATen/ops/ones.h>
#include <ATen/ops/ones_like.h>
#include <ATen/ops/slice.h>
#include <ATen/ops/unique_consecutive.h>
#include <ATen/ops/unique_consecutive_native.h>
#include <ATen/ops/unique_dim_consecutive.h>
#include <ATen/ops/unique_dim_consecutive_native.h>
#include <ATen/ops/unique_dim_native.h>
#include <ATen/ops/zeros.h>
#endif
namespace at::native {
@ -305,4 +316,85 @@ std::tuple<Tensor, Tensor, Tensor> _unique2_mps(const Tensor& self,
return _unique_impl_mps(self, return_inverse, return_counts, false, std::nullopt);
}
static Tensor lexsort_rows_perm_mps(const Tensor& mat_2d) {
const auto rows = mat_2d.size(0), cols = mat_2d.size(1);
if (rows <= 1 || cols == 0) {
return arange(rows, mat_2d.options().dtype(kLong));
}
auto perm = arange(rows, mat_2d.options().dtype(kLong));
for (auto c = cols - 1; c >= 0; --c) {
auto keys = mat_2d.select(1, c).index_select(0, perm);
const auto idx = argsort(keys, /*dim=*/0, /*descending=*/false);
perm = perm.index_select(0, idx);
}
return perm;
}
static std::tuple<Tensor, Tensor, Tensor> unique_dim_sorted_mps_impl(const Tensor& self,
                                                                     int64_t dim,
                                                                     bool return_inverse,
                                                                     bool return_counts) {
  dim = maybe_wrap_dim(dim, self.dim());

  auto sizes = self.sizes().vec();
  auto num_zero_dims = std::count(sizes.begin(), sizes.end(), (int64_t)0);
  if (self.size(dim) == 0) {
    auto output = at::empty(sizes, self.options());
    auto inverse_indices = at::empty({0}, self.options().dtype(kLong));
    auto counts = at::empty({0}, self.options().dtype(kLong));
    return {output, inverse_indices, counts};
  }

  auto transposed = self.moveaxis(dim, 0);
  auto orig_sizes = transposed.sizes().vec();
  auto rows = transposed.size(0);
  auto input_flat = transposed.contiguous().view({rows, -1});

  auto perm = lexsort_rows_perm_mps(input_flat);
  auto input_sorted = input_flat.index_select(0, perm);

  Tensor is_unique = at::zeros({rows}, self.options().dtype(kBool));
  if (rows > 0) {
    is_unique.narrow(0, 0, 1).fill_(true);
  }
  if (rows > 1) {
    auto a = input_sorted.narrow(0, 1, rows - 1);
    auto b = input_sorted.narrow(0, 0, rows - 1);
    auto row_changed = a.ne(b).any(1);
    is_unique.narrow(0, 1, rows - 1).copy_(row_changed);
  }

  auto unique_pos = nonzero(is_unique).squeeze(1);
  auto group_id = cumsum(is_unique.to(kLong), 0).sub(1);
  auto unique_rows_2d = input_sorted.index_select(0, unique_pos);

  Tensor inverse_indices = empty({0}, self.options().dtype(kLong));
  if (return_inverse) {
    inverse_indices = empty({rows}, self.options().dtype(kLong));
    inverse_indices.index_copy_(0, perm, group_id);
  }

  Tensor counts = empty({0}, self.options().dtype(kLong));
  if (return_counts) {
    const auto num_unique = unique_pos.size(0);
    counts = zeros({num_unique}, self.options().dtype(kLong));
    counts.scatter_add_(0, group_id, ones_like(group_id, group_id.options().dtype(kLong)));
  }

  orig_sizes[0] = unique_rows_2d.size(0);
  auto output = unique_rows_2d.view(orig_sizes).moveaxis(0, dim);

  return std::make_tuple(std::move(output), std::move(inverse_indices), std::move(counts));
}

std::tuple<Tensor, Tensor, Tensor> unique_dim_mps(const Tensor& self,
                                                  int64_t dim,
                                                  const bool /*sorted*/,
                                                  const bool return_inverse,
                                                  const bool return_counts) {
  return unique_dim_sorted_mps_impl(self, dim, return_inverse, return_counts);
}

} // namespace at::native
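With the dispatch entry added in the native_functions.yaml hunk below, torch.unique over a dimension should now be served by this kernel for MPS tensors. A hedged usage sketch (assuming a machine with an MPS device):

```python
import torch

x = torch.tensor([[1, 2], [3, 4], [1, 2]], device="mps")
values, inverse, counts = torch.unique(x, dim=0, return_inverse=True, return_counts=True)
# values  -> the unique rows, in lexicographically sorted order
# inverse -> for each original row, its index into `values`
# counts  -> number of occurrences of each unique row
```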

View File

@ -1409,7 +1409,7 @@
- func: _sparse_broadcast_to(Tensor(a) self, int[] size) -> Tensor(a)
variants: function
dispatch:
SparseCPU, SparseCUDA: sparse_broadcast_to
SparseCPU, SparseCUDA, SparseMPS: sparse_broadcast_to
- func: cat(Tensor[] tensors, int dim=0) -> Tensor
structured_delegate: cat.out
@ -6450,6 +6450,7 @@
dispatch:
CPU: unique_dim_cpu
CUDA: unique_dim_cuda
MPS: unique_dim_mps
tags: dynamic_output_shape
autogen: unique_dim.out

View File

@ -27,6 +27,6 @@ REGISTER_AVX512_DISPATCH(flatten_indices_stub, &flatten_indices_cpu_kernel)
REGISTER_AVX2_DISPATCH(flatten_indices_stub, &flatten_indices_cpu_kernel)
REGISTER_VSX_DISPATCH(flatten_indices_stub, &flatten_indices_cpu_kernel)
REGISTER_ZVECTOR_DISPATCH(flatten_indices_stub, &flatten_indices_cpu_kernel)
REGISTER_SVE256_DISPATCH(flatten_indices_stub, &flatten_indices_cpu_kernel)
REGISTER_SVE_DISPATCH(flatten_indices_stub, &flatten_indices_cpu_kernel)
} // namespace at::native

View File

@ -161,19 +161,19 @@ REGISTER_AVX512_DISPATCH(mul_sparse_sparse_out_stub, &mul_sparse_sparse_out_cpu_
REGISTER_AVX2_DISPATCH(mul_sparse_sparse_out_stub, &mul_sparse_sparse_out_cpu_kernel)
REGISTER_VSX_DISPATCH(mul_sparse_sparse_out_stub, &mul_sparse_sparse_out_cpu_kernel)
REGISTER_ZVECTOR_DISPATCH(mul_sparse_sparse_out_stub, &mul_sparse_sparse_out_cpu_kernel)
REGISTER_SVE256_DISPATCH(mul_sparse_sparse_out_stub, &mul_sparse_sparse_out_cpu_kernel)
REGISTER_SVE_DISPATCH(mul_sparse_sparse_out_stub, &mul_sparse_sparse_out_cpu_kernel)
REGISTER_ARCH_DISPATCH(sparse_mask_intersection_out_stub, DEFAULT, &sparse_mask_intersection_out_cpu_kernel)
REGISTER_AVX512_DISPATCH(sparse_mask_intersection_out_stub, &sparse_mask_intersection_out_cpu_kernel)
REGISTER_AVX2_DISPATCH(sparse_mask_intersection_out_stub, &sparse_mask_intersection_out_cpu_kernel)
REGISTER_VSX_DISPATCH(sparse_mask_intersection_out_stub, &sparse_mask_intersection_out_cpu_kernel)
REGISTER_ZVECTOR_DISPATCH(sparse_mask_intersection_out_stub, &sparse_mask_intersection_out_cpu_kernel)
REGISTER_SVE256_DISPATCH(sparse_mask_intersection_out_stub, &sparse_mask_intersection_out_cpu_kernel)
REGISTER_SVE_DISPATCH(sparse_mask_intersection_out_stub, &sparse_mask_intersection_out_cpu_kernel)
REGISTER_ARCH_DISPATCH(sparse_mask_projection_out_stub, DEFAULT, &sparse_mask_projection_out_cpu_kernel)
REGISTER_AVX512_DISPATCH(sparse_mask_projection_out_stub, &sparse_mask_projection_out_cpu_kernel)
REGISTER_AVX2_DISPATCH(sparse_mask_projection_out_stub, &sparse_mask_projection_out_cpu_kernel)
REGISTER_VSX_DISPATCH(sparse_mask_projection_out_stub, &sparse_mask_projection_out_cpu_kernel)
REGISTER_ZVECTOR_DISPATCH(sparse_mask_projection_out_stub, &sparse_mask_projection_out_cpu_kernel)
REGISTER_SVE256_DISPATCH(sparse_mask_projection_out_stub, &sparse_mask_projection_out_cpu_kernel)
REGISTER_SVE_DISPATCH(sparse_mask_projection_out_stub, &sparse_mask_projection_out_cpu_kernel)
}

View File

@ -448,7 +448,7 @@ REGISTER_AVX2_DISPATCH(_fused_sdp_choice_stub, &_fused_sdp_choice_cpp)
REGISTER_AVX512_DISPATCH(_fused_sdp_choice_stub, &_fused_sdp_choice_cpp)
REGISTER_VSX_DISPATCH(_fused_sdp_choice_stub, &_fused_sdp_choice_cpp)
REGISTER_ZVECTOR_DISPATCH(_fused_sdp_choice_stub, &_fused_sdp_choice_cpp)
REGISTER_SVE256_DISPATCH(_fused_sdp_choice_stub, &_fused_sdp_choice_cpp)
REGISTER_SVE_DISPATCH(_fused_sdp_choice_stub, &_fused_sdp_choice_cpp)
REGISTER_HPU_DISPATCH(_fused_sdp_choice_stub, &_fused_sdp_choice_meta)
int64_t _fused_sdp_choice_meta(

View File

@ -688,6 +688,15 @@ bool can_use_cudnn_attention(const sdp_params& params, bool debug) {
TORCH_WARN(CUDNN_VERSION, " cuDNN version too old to use cuDNN Attention (< v9.0.0)");
}
return false;
#endif
#if defined(CUDNN_VERSION)
static auto cudnn_version = cudnnGetVersion();
if (params.dropout > 0.0 && cudnn_version > 91100 && cudnn_version < 91400) {
if (debug) {
TORCH_WARN(CUDNN_VERSION, " cuDNN version does not support dropout in SDPA (9.11 - 9.13).");
}
return false;
}
#endif
// Define gate functions that determine if a flash kernel can be ran
// Replace with std::to_array when we migrate to c++20
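The added guard means that, on the affected cuDNN releases, requesting dropout keeps SDPA off the cuDNN backend and lets the dispatcher fall through to another allowed implementation. A hedged sketch of exercising that path from Python (illustrative only; requires a CUDA build):

```python
import torch
import torch.nn.functional as F
from torch.nn.attention import SDPBackend, sdpa_kernel

q = torch.randn(2, 8, 128, 64, device="cuda", dtype=torch.float16)
k = torch.randn(2, 8, 128, 64, device="cuda", dtype=torch.float16)
v = torch.randn(2, 8, 128, 64, device="cuda", dtype=torch.float16)

# With dropout_p > 0, the version check above can reject cuDNN attention,
# in which case the MATH backend listed here serves the call instead.
with sdpa_kernel([SDPBackend.CUDNN_ATTENTION, SDPBackend.MATH]):
    out = F.scaled_dot_product_attention(q, k, v, dropout_p=0.1)
```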

View File

@ -218,7 +218,7 @@ maml_omniglot,pass,0
microbench_unbacked_tolist_sum,pass,1
microbench_unbacked_tolist_sum,pass,2

(CSV columns: name, accuracy, graph_breaks)

View File

@ -146,7 +146,7 @@ maml_omniglot,pass,7
microbench_unbacked_tolist_sum,pass,8
microbench_unbacked_tolist_sum,pass,9

View File

@ -182,7 +182,7 @@ maml_omniglot,pass,0
microbench_unbacked_tolist_sum,pass,0
microbench_unbacked_tolist_sum,pass,1

View File

@ -182,7 +182,7 @@ maml_omniglot,pass,0
microbench_unbacked_tolist_sum,pass,0
microbench_unbacked_tolist_sum,pass,1

View File

@ -198,7 +198,7 @@ maml_omniglot,pass,0
microbench_unbacked_tolist_sum,pass,1
microbench_unbacked_tolist_sum,pass,2

View File

@ -198,7 +198,7 @@ maml_omniglot,pass,0
microbench_unbacked_tolist_sum,pass,1
microbench_unbacked_tolist_sum,pass,2

View File

@ -198,7 +198,7 @@ maml_omniglot,pass,0
microbench_unbacked_tolist_sum,pass,1
microbench_unbacked_tolist_sum,pass,2

View File

@ -218,7 +218,7 @@ maml_omniglot,pass,0
microbench_unbacked_tolist_sum,pass,1
microbench_unbacked_tolist_sum,pass,2

View File

@ -146,7 +146,7 @@ maml_omniglot,pass,7
microbench_unbacked_tolist_sum,pass,8
microbench_unbacked_tolist_sum,pass,9

View File

@ -166,7 +166,7 @@ maml_omniglot,pass,0
microbench_unbacked_tolist_sum,pass,0
microbench_unbacked_tolist_sum,pass,1

View File

@ -166,7 +166,7 @@ maml_omniglot,pass,0
microbench_unbacked_tolist_sum,pass,0
microbench_unbacked_tolist_sum,pass,1

View File

@ -182,7 +182,7 @@ maml_omniglot,pass,0
microbench_unbacked_tolist_sum,pass,1
microbench_unbacked_tolist_sum,pass,2

View File

@ -198,7 +198,7 @@ maml_omniglot,pass,0
microbench_unbacked_tolist_sum,pass,1
microbench_unbacked_tolist_sum,pass,2

View File

@ -218,7 +218,7 @@ maml_omniglot,pass,0
microbench_unbacked_tolist_sum,pass,1
microbench_unbacked_tolist_sum,pass,2

View File

@ -146,7 +146,7 @@ maml_omniglot,pass,7
microbench_unbacked_tolist_sum,pass,8
microbench_unbacked_tolist_sum,pass,9

View File

@ -218,7 +218,7 @@ maml_omniglot,pass,0
microbench_unbacked_tolist_sum,pass,1
microbench_unbacked_tolist_sum,pass,2

View File

@ -146,7 +146,7 @@ maml_omniglot,pass,7
microbench_unbacked_tolist_sum,pass,8
microbench_unbacked_tolist_sum,pass,9

View File

@ -218,7 +218,7 @@ maml_omniglot,pass,0
microbench_unbacked_tolist_sum,pass,1
microbench_unbacked_tolist_sum,pass,2

View File

@ -146,7 +146,7 @@ maml_omniglot,pass,7
microbench_unbacked_tolist_sum,pass,8
microbench_unbacked_tolist_sum,pass,9

View File

@ -221,7 +221,7 @@ maml_omniglot,pass,0
microbench_unbacked_tolist_sum,pass,1
microbench_unbacked_tolist_sum,pass,2

View File

@ -154,7 +154,7 @@ maml_omniglot,pass,7
microbench_unbacked_tolist_sum,pass,8
microbench_unbacked_tolist_sum,pass,9

View File

@ -221,7 +221,7 @@ maml_omniglot,pass,0
microbench_unbacked_tolist_sum,pass,1
microbench_unbacked_tolist_sum,pass,2

View File

@ -154,7 +154,7 @@ maml_omniglot,pass,7
microbench_unbacked_tolist_sum,pass,8
microbench_unbacked_tolist_sum,pass,9

View File

@ -142,7 +142,7 @@ maml_omniglot,pass,7
microbench_unbacked_tolist_sum,pass,8
microbench_unbacked_tolist_sum,pass,9

View File

@ -221,7 +221,7 @@ maml_omniglot,pass,0
microbench_unbacked_tolist_sum,pass,1
microbench_unbacked_tolist_sum,pass,2

View File

@ -154,7 +154,7 @@ maml_omniglot,pass,7
microbench_unbacked_tolist_sum,pass,8
microbench_unbacked_tolist_sum,pass,9

View File

@ -142,7 +142,7 @@ maml_omniglot,pass,7
microbench_unbacked_tolist_sum,pass,8
microbench_unbacked_tolist_sum,pass,9

View File

@ -6,4 +6,4 @@
4. (Optional) flip a flag that you know will change the benchmark and run again with b.txt `PYTHONPATH=./ python benchmarks/[YOUR_BENCHMARK].py a.txt`
5. Compare `a.txt` and `b.txt` located within the `benchmarks/dynamo/pr_time_benchmarks` folder to make sure things look as you expect
6. Check in your new benchmark file and submit a new PR
7. In a few days, if your benchmark is stable, bug Laith Sakka to enable running your benchmark on all PRs. If your a meta employee, you can find the dashboard here: internalfb.com/intern/unidash/dashboard/pt2_diff_time_metrics
7. In a few days, if your benchmark is stable, bug Laith Sakka to enable running your benchmark on all PRs. If you are a meta employee, you can find the dashboard here: https://internalfb.com/intern/unidash/dashboard/pt2_diff_time_metrics

View File

@ -0,0 +1,111 @@
import sys

from benchmark_base import BenchmarkBase

import torch
from torch.autograd.grad_mode import inference_mode


class Benchmark(BenchmarkBase):
    def __init__(self, requires_grad, inference_mode, backward, dynamic):
        assert not (inference_mode and backward), (
            "inference_mode and backward cannot be both True"
        )
        self._requires_grad = requires_grad
        self._inference_mode = inference_mode
        self._backward = backward
        super().__init__(
            category="runtime_overhead",
            backend="inductor",
            device="cuda",
            dynamic=dynamic,
        )

    def name(self):
        prefix = f"{self.category()}_{self.backend()}"
        if self._requires_grad:
            prefix += "_requires_grad"
        if self._inference_mode:
            prefix += "_inference_mode"
        if self._backward:
            prefix += "_backward"
        if self.is_dynamic():
            prefix += "_dynamic"
        return prefix

    def description(self):
        return "runtime of a compiled add1 op small input"

    def _prepare_once(self):
        torch._dynamo.reset()
        self.a = torch.ones(2, device=self.device(), requires_grad=self._requires_grad)

        @torch.compile(
            backend=self.backend(),
            fullgraph=True,
            dynamic=self.is_dynamic(),
        )
        def add1(a):
            return a + 1

        self._add1 = add1

        # warmup
        for _ in range(10):
            if self._backward:
                self.forward_val = self._add1(self.a).sum()
                self.forward_val.backward()
            else:
                self._work()

    def _prepare(self):
        if self._backward:
            self.forward_val = self._add1(self.a).sum()

    def _work(self):
        if self._inference_mode:
            with inference_mode():
                self._add1(self.a)
        elif self._backward:
            self.forward_val.backward()
        else:
            self._add1(self.a)


def main():
    result_path = sys.argv[1]
    all = [
        Benchmark(
            requires_grad=False, inference_mode=False, backward=False, dynamic=False
        ),
        Benchmark(
            requires_grad=False, inference_mode=True, backward=False, dynamic=False
        ),
        Benchmark(
            requires_grad=True, inference_mode=False, backward=False, dynamic=False
        ),
        Benchmark(
            requires_grad=True, inference_mode=False, backward=True, dynamic=False
        ),
        Benchmark(
            requires_grad=False, inference_mode=False, backward=False, dynamic=True
        ),
        Benchmark(
            requires_grad=False, inference_mode=True, backward=False, dynamic=True
        ),
        Benchmark(
            requires_grad=True, inference_mode=False, backward=False, dynamic=True
        ),
        Benchmark(
            requires_grad=True, inference_mode=False, backward=True, dynamic=True
        ),
    ]
    for benchmark in all:
        benchmark.enable_instruction_count().collect_all().append_results(result_path)


if __name__ == "__main__":
    main()

View File

@ -20,7 +20,7 @@ Key Features:
The instruction below installs a cpp\_extension for PyTorch and it is required to run the benchmark suite.
```bash
cd pt_extension
python -m pip install .
python -m pip install . -v --no-build-isolation
```
## How to run the benchmarks:

View File

@ -18,6 +18,7 @@ import torch
# needs to be imported after torch
import torch.utils.cpp_extension as cpp_extension # noqa: F401
from torch.utils.benchmark import Timer
"""Performance microbenchmarks.
@ -348,10 +349,24 @@ class BenchmarkRunner:
func = test_case.run_jit_forward
if self.use_compile:
func = test_case.run_compile_forward
forward_time = timeit.timeit(
functools.partial(func, iters, print_per_iter, cuda_sync), number=1
if not cuda_sync:
forward_time = timeit.timeit(
functools.partial(func, iters, print_per_iter, cuda_sync), number=1
)
return forward_time
# Stable timing with Timer
timer = Timer(
stmt="func(iters, print_per_iter, cuda_sync)",
globals={
"func": func,
"iters": iters,
"print_per_iter": print_per_iter,
"cuda_sync": cuda_sync,
},
)
return forward_time
result = timer.adaptive_autorange(min_run_time=0.0001)
return result.median * iters
def _launch_backward(self, test_case, iters, print_per_iter=False):
"""This function runs forward path of an op to get an output. Then the backward path is executed

View File

@ -161,6 +161,8 @@ class PyTorchOperatorTestCase:
if self._compile_forward_graph is None:
self._compile_forward_graph = self._generate_compile_forward_graph()
self._compile_forward_graph(num_runs)
if cuda_sync:
torch.cuda.synchronize(torch.cuda.current_device())
def _print_per_iter(self):
# print last 50 values

View File

@ -52,27 +52,6 @@ class AddBenchmark(op_bench.TorchBenchmarkBase):
op_bench.generate_pt_test(add_long_configs + add_short_configs, AddBenchmark)
op_bench.generate_pt_gradient_test(add_long_configs + add_short_configs, AddBenchmark)
"""Mircobenchmark for addmm operator."""
class AddmmBenchmark(op_bench.TorchBenchmarkBase):
def init(self, M, N, K, device):
self.inputs = {
"input_one": torch.rand(M, K, device=device, requires_grad=self.auto_set()),
"mat1": torch.rand(M, N, device=device, requires_grad=self.auto_set()),
"mat2": torch.rand(N, K, device=device, requires_grad=self.auto_set()),
}
self.set_module_name("addmm")
def forward(self, input_one, mat1, mat2):
return torch.addmm(input_one, mat1, mat2)
op_bench.generate_pt_test(add_long_configs + add_short_configs, AddmmBenchmark)
op_bench.generate_pt_gradient_test(add_long_configs + add_short_configs, AddmmBenchmark)
"""Mircobenchmark for addr operator."""
@ -106,46 +85,5 @@ addr_configs = op_bench.cross_product_configs(
op_bench.generate_pt_test(addr_configs, AddrBenchmark)
op_bench.generate_pt_gradient_test(addr_configs, AddrBenchmark)
"""Mircobenchmark for addbmm operator."""
class AddbmmBenchmark(op_bench.TorchBenchmarkBase):
def init(self, B, M, N, K, device):
self.inputs = {
"input_one": torch.rand(
(M, N), device=device, requires_grad=self.auto_set()
),
"batch1": torch.rand(
(B, M, K), device=device, requires_grad=self.auto_set()
),
"batch2": torch.rand(
(
B,
K,
N,
),
device=device,
requires_grad=self.auto_set(),
),
}
self.set_module_name("addbmm")
def forward(self, input_one, batch1, batch2):
return torch.addbmm(input_one, batch1, batch2)
addbmm_configs = op_bench.cross_product_configs(
B=[2, 100],
M=[8, 256],
N=[256, 16],
K=[15, 16],
device=["cpu", "cuda"],
tags=["addbmm"],
)
op_bench.generate_pt_test(addbmm_configs, AddbmmBenchmark)
op_bench.generate_pt_gradient_test(addbmm_configs, AddbmmBenchmark)
if __name__ == "__main__":
op_bench.benchmark_runner.main()

Some files were not shown because too many files have changed in this diff.