clone from 6640eda

2025-10-26 08:34:52 +08:00 · 2025-03-12 17:21:50 -07:00
394 changed files with 5078 additions and 13158 deletions
--- a/.ci/docker/build.sh
+++ b/.ci/docker/build.sh
@ -105,6 +105,7 @@ case "$image" in
    ANACONDA_PYTHON_VERSION=3.10
    GCC_VERSION=11
    PROTOBUF=yes
+    DB=yes
    VISION=yes
    KATEX=yes
    UCX_COMMIT=${_UCX_COMMIT}
@ -118,6 +119,7 @@ case "$image" in
    ANACONDA_PYTHON_VERSION=3.10
    GCC_VERSION=9
    PROTOBUF=yes
+    DB=yes
    VISION=yes
    KATEX=yes
    UCX_COMMIT=${_UCX_COMMIT}
@ -132,6 +134,7 @@ case "$image" in
    ANACONDA_PYTHON_VERSION=3.12
    GCC_VERSION=9
    PROTOBUF=yes
+    DB=yes
    VISION=yes
    KATEX=yes
    UCX_COMMIT=${_UCX_COMMIT}
@ -146,6 +149,7 @@ case "$image" in
    ANACONDA_PYTHON_VERSION=3.13
    GCC_VERSION=9
    PROTOBUF=yes
+    DB=yes
    VISION=yes
    KATEX=yes
    UCX_COMMIT=${_UCX_COMMIT}
@ -160,6 +164,7 @@ case "$image" in
    ANACONDA_PYTHON_VERSION=3.10
    GCC_VERSION=9
    PROTOBUF=yes
+    DB=yes
    VISION=yes
    KATEX=yes
    UCX_COMMIT=${_UCX_COMMIT}
@ -173,6 +178,7 @@ case "$image" in
    ANACONDA_PYTHON_VERSION=3.10
    GCC_VERSION=9
    PROTOBUF=yes
+    DB=yes
    VISION=yes
    KATEX=yes
    UCX_COMMIT=${_UCX_COMMIT}
@ -187,6 +193,7 @@ case "$image" in
    ANACONDA_PYTHON_VERSION=3.12
    GCC_VERSION=9
    PROTOBUF=yes
+    DB=yes
    VISION=yes
    KATEX=yes
    UCX_COMMIT=${_UCX_COMMIT}
@ -201,6 +208,7 @@ case "$image" in
    ANACONDA_PYTHON_VERSION=3.13
    GCC_VERSION=9
    PROTOBUF=yes
+    DB=yes
    VISION=yes
    KATEX=yes
    UCX_COMMIT=${_UCX_COMMIT}
@ -215,6 +223,7 @@ case "$image" in
    ANACONDA_PYTHON_VERSION=3.10
    GCC_VERSION=9
    PROTOBUF=yes
+    DB=yes
    VISION=yes
    KATEX=yes
    UCX_COMMIT=${_UCX_COMMIT}
@ -226,6 +235,7 @@ case "$image" in
    ANACONDA_PYTHON_VERSION=3.9
    CLANG_VERSION=10
    PROTOBUF=yes
+    DB=yes
    VISION=yes
    CONDA_CMAKE=yes
    ONNX=yes
@ -234,6 +244,7 @@ case "$image" in
    ANACONDA_PYTHON_VERSION=3.9
    CLANG_VERSION=10
    PROTOBUF=yes
+    DB=yes
    VISION=yes
    VULKAN_SDK_VERSION=1.2.162.1
    SWIFTSHADER=yes
@ -244,6 +255,7 @@ case "$image" in
    ANACONDA_PYTHON_VERSION=3.11
    CLANG_VERSION=10
    PROTOBUF=yes
+    DB=yes
    VISION=yes
    VULKAN_SDK_VERSION=1.2.162.1
    SWIFTSHADER=yes
@ -254,6 +266,7 @@ case "$image" in
    ANACONDA_PYTHON_VERSION=3.9
    GCC_VERSION=9
    PROTOBUF=yes
+    DB=yes
    VISION=yes
    CONDA_CMAKE=yes
    TRITON=yes
@ -262,6 +275,7 @@ case "$image" in
    ANACONDA_PYTHON_VERSION=3.10
    GCC_VERSION=11
    PROTOBUF=yes
+    DB=yes
    VISION=yes
    ROCM_VERSION=6.2.4
    NINJA_VERSION=1.9.0
@ -276,6 +290,7 @@ case "$image" in
    ANACONDA_PYTHON_VERSION=3.10
    GCC_VERSION=11
    PROTOBUF=yes
+    DB=yes
    VISION=yes
    ROCM_VERSION=6.3
    NINJA_VERSION=1.9.0
@ -290,6 +305,7 @@ case "$image" in
    ANACONDA_PYTHON_VERSION=3.9
    GCC_VERSION=11
    PROTOBUF=yes
+    DB=yes
    VISION=yes
    XPU_VERSION=0.5
    NINJA_VERSION=1.9.0
@ -300,6 +316,7 @@ case "$image" in
    ANACONDA_PYTHON_VERSION=3.9
    GCC_VERSION=11
    PROTOBUF=yes
+    DB=yes
    VISION=yes
    XPU_VERSION=2025.0
    NINJA_VERSION=1.9.0
@ -310,6 +327,7 @@ case "$image" in
    ANACONDA_PYTHON_VERSION=3.9
    GCC_VERSION=11
    PROTOBUF=yes
+    DB=yes
    VISION=yes
    KATEX=yes
    CONDA_CMAKE=yes
@ -323,6 +341,7 @@ case "$image" in
    CUDNN_VERSION=9
    CLANG_VERSION=12
    PROTOBUF=yes
+    DB=yes
    VISION=yes
    TRITON=yes
    ;;
@ -330,6 +349,7 @@ case "$image" in
    ANACONDA_PYTHON_VERSION=3.9
    CLANG_VERSION=12
    PROTOBUF=yes
+    DB=yes
    VISION=yes
    CONDA_CMAKE=yes
    TRITON=yes
@ -350,6 +370,7 @@ case "$image" in
    ANACONDA_PYTHON_VERSION=3.9
    GCC_VERSION=11
    PROTOBUF=yes
+    DB=yes
    VISION=yes
    KATEX=yes
    CONDA_CMAKE=yes
@ -395,6 +416,7 @@ case "$image" in
    GCC_VERSION=11
    ACL=yes
    PROTOBUF=yes
+    DB=yes
    VISION=yes
    CONDA_CMAKE=yes
    # snadampal: skipping llvm src build install because the current version
@ -406,6 +428,7 @@ case "$image" in
    GCC_VERSION=11
    ACL=yes
    PROTOBUF=yes
+    DB=yes
    VISION=yes
    CONDA_CMAKE=yes
    # snadampal: skipping llvm src build install because the current version
@ -416,6 +439,7 @@ case "$image" in
  *)
    # Catch-all for builds that are not hardcoded.
    PROTOBUF=yes
+    DB=yes
    VISION=yes
    echo "image '$image' did not match an existing build configuration"
    if [[ "$image" == *py* ]]; then
@ -471,6 +495,7 @@ docker build \
       --build-arg "BUILD_ENVIRONMENT=${image}" \
       --build-arg "PROTOBUF=${PROTOBUF:-}" \
       --build-arg "LLVMDEV=${LLVMDEV:-}" \
+       --build-arg "DB=${DB:-}" \
       --build-arg "VISION=${VISION:-}" \
       --build-arg "UBUNTU_VERSION=${UBUNTU_VERSION}" \
       --build-arg "CENTOS_VERSION=${CENTOS_VERSION}" \
--- a/.ci/docker/centos-rocm/Dockerfile
+++ b/.ci/docker/centos-rocm/Dockerfile
@ -55,6 +55,13 @@ RUN if [ -n "${PROTOBUF}" ]; then bash ./install_protobuf.sh; fi
 RUN rm install_protobuf.sh
 ENV INSTALLED_PROTOBUF ${PROTOBUF}

+# (optional) Run the DB setup script (install_db.sh installs no LMDB/LevelDB packages itself)
+ARG DB
+COPY ./common/install_db.sh install_db.sh
+RUN if [ -n "${DB}" ]; then bash ./install_db.sh; fi
+RUN rm install_db.sh
+ENV INSTALLED_DB ${DB}
+
 # (optional) Install vision packages like OpenCV
 ARG VISION
 COPY ./common/install_vision.sh ./common/cache_vision_models.sh ./common/common_utils.sh ./
--- a/.ci/docker/ci_commit_pins/nccl-cu12.txt
+++ b/.ci/docker/ci_commit_pins/nccl-cu12.txt
@ -1 +1 @@
-v2.26.2-1
+v2.25.1-1
--- a/.ci/docker/common/install_cuda.sh
+++ b/.ci/docker/common/install_cuda.sh
@ -240,7 +240,7 @@ function prune_126 {
 }

 function install_128 {
-  CUDNN_VERSION=9.8.0.87
+  CUDNN_VERSION=9.7.1.26
  echo "Installing CUDA 12.8.0 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.6.3"
  rm -rf /usr/local/cuda-12.8 /usr/local/cuda
  # install CUDA 12.8.0 in the same container
--- a/.ci/docker/common/install_cuda_aarch64.sh
+++ b/.ci/docker/common/install_cuda_aarch64.sh
@ -161,7 +161,7 @@ function prune_126 {
 }

 function install_128 {
-  CUDNN_VERSION=9.8.0.87
+  CUDNN_VERSION=9.7.1.26
  echo "Installing CUDA 12.8.0 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.6.3"
  rm -rf /usr/local/cuda-12.8 /usr/local/cuda
  # install CUDA 12.8.0 in the same container
--- a/.ci/docker/common/install_cudnn.sh
+++ b/.ci/docker/common/install_cudnn.sh
@ -5,7 +5,7 @@ if [[ -n "${CUDNN_VERSION}" ]]; then
    mkdir tmp_cudnn
    pushd tmp_cudnn
    if [[ ${CUDA_VERSION:0:4} == "12.8" ]]; then
-        CUDNN_NAME="cudnn-linux-x86_64-9.8.0.87_cuda12-archive"
+        CUDNN_NAME="cudnn-linux-x86_64-9.7.1.26_cuda12-archive"
    elif [[ ${CUDA_VERSION:0:4} == "12.6" ]]; then
        CUDNN_NAME="cudnn-linux-x86_64-9.5.1.17_cuda12-archive"
    elif [[ ${CUDA_VERSION:0:2} == "12" ]]; then
--- a/.ci/docker/common/install_db.sh
+++ b/.ci/docker/common/install_db.sh
@ -0,0 +1,38 @@
+#!/bin/bash
+
+set -ex  # -e: abort on the first failing command; -x: trace each command as it runs
+
+install_ubuntu() {  # Ubuntu path: refresh the apt index, then clean up; installs no packages
+  apt-get update
+
+  # Cleanup: drop apt caches, package lists and temp files
+  apt-get autoclean && apt-get clean
+  rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
+}
+
+install_centos() {  # CentOS path: enable EPEL, then clean up yum state
+  # Need EPEL for many packages we depend on.
+  # See http://fedoraproject.org/wiki/EPEL
+  yum --enablerepo=extras install -y epel-release
+
+  # Cleanup: drop yum caches, yumdb and transaction history
+  yum clean all
+  rm -rf /var/cache/yum
+  rm -rf /var/lib/yum/yumdb
+  rm -rf /var/lib/yum/history
+}
+
+# Dispatch on the ID= field of /etc/os-release (double quotes stripped) to pick the OS-specific routine
+ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')
+case "$ID" in
+  ubuntu)
+    install_ubuntu
+    ;;
+  centos)
+    install_centos
+    ;;
+  *)  # any other ID is unsupported: report and fail
+    echo "Unable to determine OS..."
+    exit 1
+    ;;
+esac
--- a/.ci/docker/common/install_rocm_drm.sh
+++ b/.ci/docker/common/install_rocm_drm.sh
@ -25,9 +25,7 @@ python3 -m pip install meson ninja
 ###########################
 ### clone repo
 ###########################
-# TEMPORARY FIX: https://gitlab.freedesktop.org/mesa/drm.git is down until 2025/03/22
-# GIT_SSL_NO_VERIFY=true git clone https://gitlab.freedesktop.org/mesa/drm.git
-GIT_SSL_NO_VERIFY=true git clone git://anongit.freedesktop.org/mesa/drm
+GIT_SSL_NO_VERIFY=true git clone https://gitlab.freedesktop.org/mesa/drm.git
 pushd drm

 ###########################
--- a/.ci/docker/requirements-ci.txt
+++ b/.ci/docker/requirements-ci.txt
@ -41,14 +41,11 @@ fbscribelogger==0.1.7
 #Pinned versions: 0.1.6
 #test that import:

-flatbuffers==2.0 ; platform_machine != "s390x"
+flatbuffers==2.0
 #Description: cross platform serialization library
 #Pinned versions: 2.0
 #test that import:

-flatbuffers ; platform_machine == "s390x"
-#Description: cross platform serialization library; Newer version is required on s390x for new python version
-
 hypothesis==5.35.1
 # Pin hypothesis to avoid flakiness: https://github.com/pytorch/pytorch/issues/31136
 #Description: advanced library for generating parametrized tests
@ -105,10 +102,10 @@ networkx==2.8.8
 #Pinned versions: 2.8.8
 #test that import: functorch

-ninja==1.11.1.3
-#Description: build system. Used in some tests. Used in build to generate build
-#time tracing information
-#Pinned versions: 1.11.1.3
+#ninja
+#Description: build system.  Note that installing it from
+#here breaks things, so it is commented out
+#Pinned versions: 1.10.0.post1
 #test that import: run_test.py, test_cpp_extensions_aot.py,test_determination.py

 numba==0.49.0 ; python_version < "3.9"
@ -368,6 +365,7 @@ PyYAML
 pyzstd
 setuptools

+ninja==1.11.1 ; platform_machine == "aarch64"
 scons==4.5.2 ; platform_machine == "aarch64"

 pulp==2.9.0 ; python_version >= "3.8"
--- a/.ci/docker/ubuntu-cuda/Dockerfile
+++ b/.ci/docker/ubuntu-cuda/Dockerfile
@ -50,6 +50,13 @@ RUN if [ -n "${PROTOBUF}" ]; then bash ./install_protobuf.sh; fi
 RUN rm install_protobuf.sh
 ENV INSTALLED_PROTOBUF ${PROTOBUF}

+# (optional) Run the DB setup script (install_db.sh installs no LMDB/LevelDB packages itself)
+ARG DB
+COPY ./common/install_db.sh install_db.sh
+RUN if [ -n "${DB}" ]; then bash ./install_db.sh; fi
+RUN rm install_db.sh
+ENV INSTALLED_DB ${DB}
+
 # (optional) Install vision packages like OpenCV
 ARG VISION
 COPY ./common/install_vision.sh ./common/cache_vision_models.sh ./common/common_utils.sh ./
--- a/.ci/docker/ubuntu-rocm/Dockerfile
+++ b/.ci/docker/ubuntu-rocm/Dockerfile
@ -50,6 +50,13 @@ RUN if [ -n "${PROTOBUF}" ]; then bash ./install_protobuf.sh; fi
 RUN rm install_protobuf.sh
 ENV INSTALLED_PROTOBUF ${PROTOBUF}

+# (optional) Run the DB setup script (install_db.sh installs no LMDB/LevelDB packages itself)
+ARG DB
+COPY ./common/install_db.sh install_db.sh
+RUN if [ -n "${DB}" ]; then bash ./install_db.sh; fi
+RUN rm install_db.sh
+ENV INSTALLED_DB ${DB}
+
 # (optional) Install vision packages like OpenCV
 ARG VISION
 COPY ./common/install_vision.sh ./common/cache_vision_models.sh ./common/common_utils.sh ./
--- a/.ci/docker/ubuntu-xpu/Dockerfile
+++ b/.ci/docker/ubuntu-xpu/Dockerfile
@ -77,6 +77,13 @@ COPY triton_version.txt triton_version.txt
 RUN if [ -n "${TRITON}" ]; then bash ./install_triton.sh; fi
 RUN rm install_triton.sh common_utils.sh triton-xpu.txt triton_version.txt

+# (optional) Run the DB setup script (install_db.sh installs no LMDB/LevelDB packages itself)
+ARG DB
+COPY ./common/install_db.sh install_db.sh
+RUN if [ -n "${DB}" ]; then bash ./install_db.sh; fi
+RUN rm install_db.sh
+ENV INSTALLED_DB ${DB}
+
 # (optional) Install vision packages like OpenCV
 ARG VISION
 COPY ./common/install_vision.sh ./common/cache_vision_models.sh ./common/common_utils.sh ./
--- a/.ci/docker/ubuntu/Dockerfile
+++ b/.ci/docker/ubuntu/Dockerfile
@ -74,6 +74,13 @@ RUN if [ -n "${PROTOBUF}" ]; then bash ./install_protobuf.sh; fi
 RUN rm install_protobuf.sh
 ENV INSTALLED_PROTOBUF ${PROTOBUF}

+# (optional) Run the DB setup script (install_db.sh installs no LMDB/LevelDB packages itself)
+ARG DB
+COPY ./common/install_db.sh install_db.sh
+RUN if [ -n "${DB}" ]; then bash ./install_db.sh; fi
+RUN rm install_db.sh
+ENV INSTALLED_DB ${DB}
+
 # (optional) Install vision packages like OpenCV
 ARG VISION
 COPY ./common/install_vision.sh ./common/cache_vision_models.sh ./common/common_utils.sh ./
--- a/.ci/pytorch/check_binary.sh
+++ b/.ci/pytorch/check_binary.sh
@ -73,14 +73,26 @@ fi
 # Check GCC ABI
 ###############################################################################

-# NOTE: As of https://github.com/pytorch/pytorch/issues/126551 we only produce
-#       wheels with cxx11-abi
+# NOTE [ Building libtorch with old vs. new gcc ABI ]
+#
+# Packages built with one version of the ABI cannot be linked against by client
+# C++ libraries that were compiled with the other version. Since both
+# gcc ABIs are still common in the wild, we need to support both ABIs. Currently:
+#
+# - All the nightlies built on CentOS 7 + devtoolset7 use the old gcc ABI.
+# - All the nightlies built on Ubuntu 16.04 + gcc 5.4 use the new gcc ABI.

 echo "Checking that the gcc ABI is what we expect"
 if [[ "$(uname)" != 'Darwin' ]]; then
  function is_expected() {
-    if [[ "$1" -gt 0 || "$1" == "ON " ]]; then
-      echo 1
+    if [[ "$DESIRED_DEVTOOLSET" == *"cxx11-abi"* || "$DESIRED_CUDA" == *"rocm"* ]]; then
+      if [[ "$1" -gt 0 || "$1" == "ON " ]]; then
+        echo 1
+      fi
+    else
+      if [[ -z "$1" || "$1" == 0 || "$1" == "OFF" ]]; then
+        echo 1
+      fi
    fi
  }

--- a/.ci/pytorch/smoke_test/check_binary_symbols.py
+++ b/.ci/pytorch/smoke_test/check_binary_symbols.py
@ -121,9 +121,9 @@ def main() -> None:
        else:
            install_root = Path(distutils.sysconfig.get_python_lib()) / "torch"

-    libtorch_cpu_path = str(install_root / "lib" / "libtorch_cpu.so")
-    # NOTE: All binaries are built with cxx11abi now
-    check_lib_symbols_for_abi_correctness(libtorch_cpu_path, False)
+    libtorch_cpu_path = install_root / "lib" / "libtorch_cpu.so"
+    pre_cxx11_abi = "cxx11-abi" not in os.getenv("DESIRED_DEVTOOLSET", "")
+    check_lib_symbols_for_abi_correctness(libtorch_cpu_path, pre_cxx11_abi)


 if __name__ == "__main__":
--- a/.ci/pytorch/smoke_test/smoke_test.py
+++ b/.ci/pytorch/smoke_test/smoke_test.py
@ -76,13 +76,10 @@ def read_release_matrix():


 def test_numpy():
-    try:
-        import numpy as np
+    import numpy as np

-        x = np.arange(5)
-        torch.tensor(x)
-    except ImportError:
-        print("Numpy check skipped. Numpy is not installed.")
+    x = np.arange(5)
+    torch.tensor(x)


 def check_version(package: str) -> None:
@ -413,7 +410,6 @@ def main() -> None:
    smoke_test_conv2d()
    test_linalg()
    test_numpy()
-
    if is_cuda_system:
        test_linalg("cuda")
        test_cuda_gds_errors_captured()
--- a/.ci/pytorch/test.sh
+++ b/.ci/pytorch/test.sh
@ -1619,7 +1619,6 @@ elif [[ "${TEST_CONFIG}" == *inductor_cpp_wrapper* ]]; then
  install_torchvision
  checkout_install_torchbench hf_T5 llama moco
  PYTHONPATH=$(pwd)/torchbench test_inductor_cpp_wrapper_shard "$SHARD_NUMBER"
-  test_inductor_aoti
 elif [[ "${TEST_CONFIG}" == *inductor* ]]; then
  install_torchvision
  test_inductor_shard "${SHARD_NUMBER}"
--- a/.circleci/scripts/binary_upload.sh
+++ b/.circleci/scripts/binary_upload.sh
@ -55,16 +55,12 @@ s3_upload() {
    s3_upload_dir="${s3_root_dir}/${UPLOAD_SUBFOLDER}/"
  fi
  (
-    cache_control_flag=""
-    if [[ "${UPLOAD_CHANNEL}" = "test" ]]; then
-      cache_control_flag="--cache-control='no-cache,no-store,must-revalidate'"
-    fi
    for pkg in ${PKG_DIR}/*.${extension}; do
      (
        set -x
        shm_id=$(sha256sum "${pkg}" | awk '{print $1}')
        ${AWS_S3_CP} --no-progress --acl public-read "${pkg}" "${s3_upload_dir}" \
-          --metadata "checksum-sha256=${shm_id}" ${cache_control_flag}
+          --metadata "checksum-sha256=${shm_id}"
      )
    done
  )
--- a/.github/actionlint.yaml
+++ b/.github/actionlint.yaml
@ -3,11 +3,8 @@ self-hosted-runner:
    # GitHub hosted runner that actionlint doesn't recognize because actionlint version (1.6.21) is too old
    - ubuntu-24.04
    # GitHub hosted x86 Linux runners
-    # TODO: Cleanup mentions of linux.20_04 when upgrade to linux.24_04 is complete
    - linux.20_04.4x
    - linux.20_04.16x
-    - linux.24_04.4x
-    - linux.24_04.16x
    # Organization-wide AWS Linux Runners
    - linux.large
    - linux.2xlarge
@ -52,7 +49,6 @@ self-hosted-runner:
    - linux.rocm.gpu
    - linux.rocm.gpu.2
    - linux.rocm.gpu.4
-    - rocm-docker
    # Repo-specific Apple hosted  runners
    - macos-m1-ultra
    - macos-m2-14
--- a/.github/actions/checkout-pytorch/action.yml
+++ b/.github/actions/checkout-pytorch/action.yml
@ -23,44 +23,9 @@ runs:
      id: check_container_runner
      run: echo "IN_CONTAINER_RUNNER=$(if [ -f /.inarc ] || [ -f /.incontainer ]; then echo true ; else echo false; fi)" >> "$GITHUB_OUTPUT"

-    - name: Set up parallel fetch and clean workspace
-      id: first-clean
-      continue-on-error: true
+    - name: Clean workspace
      shell: bash
      if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' }}
-      env:
-        NO_SUDO: ${{ inputs.no-sudo }}
-      run: |
-        # Use all available CPUs for fetching
-        cd "${GITHUB_WORKSPACE}"
-        git config --global fetch.parallel 0
-        git config --global submodule.fetchJobs 0
-
-        # Clean workspace. The default checkout action should also do this, but
-        # do it here as well just in case
-        if [[ -d .git ]]; then
-          if [ -z "${NO_SUDO}" ]; then
-            sudo git clean -ffdx
-          else
-            git clean -ffdx
-          fi
-        fi
-
-    - name: Checkout PyTorch
-      id: first-checkout-attempt
-      continue-on-error: true
-      uses: actions/checkout@v4
-      with:
-        ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
-        # --depth=1 for speed, manually fetch history and other refs as necessary
-        fetch-depth: ${{ inputs.fetch-depth }}
-        submodules: ${{ inputs.submodules }}
-        show-progress: false
-
-    - name: Clean workspace (try again)
-      if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' &&
-        (steps.first-clean.outcome != 'success' || steps.first-checkout-attempt.outcome != 'success') }}
-      shell: bash
      env:
        NO_SUDO: ${{ inputs.no-sudo }}
      run: |
@ -75,11 +40,16 @@ runs:
        fi
        mkdir "${GITHUB_WORKSPACE}"

-    - name: Checkout PyTorch (try again)
+        # Use all available CPUs for fetching
+        cd "${GITHUB_WORKSPACE}"
+        git config --global fetch.parallel 0
+        git config --global submodule.fetchJobs 0
+
+    - name: Checkout PyTorch
      uses: actions/checkout@v4
-      if: ${{ steps.first-clean.outcome != 'success' || steps.first-checkout-attempt.outcome != 'success' }}
      with:
        ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+        # --depth=1 for speed, manually fetch history and other refs as necessary
        fetch-depth: ${{ inputs.fetch-depth }}
        submodules: ${{ inputs.submodules }}
        show-progress: false
--- a/.github/scripts/generate_binary_build_matrix.py
+++ b/.github/scripts/generate_binary_build_matrix.py
@ -68,7 +68,7 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
        "nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'"
@ -77,14 +77,14 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
        "nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-cudnn-cu12==9.8.0.87; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-cudnn-cu12==9.7.1.26; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64'"
--- a/.github/scripts/s390x-ci/tests_list.py
+++ b/.github/scripts/s390x-ci/tests_list.py
@ -1,97 +0,0 @@
-#!/usr/bin/env python3
-
-import os
-import re
-import sys
-
-
-sys.path.insert(1, os.path.join(sys.path[0], "..", "..", ".."))
-
-from tools.testing.discover_tests import TESTS
-
-
-skip_list = [
-    # these tests fail due to various reasons
-    "dynamo/test_misc",
-    "inductor/test_aot_inductor",
-    "inductor/test_cpu_repro",
-    "inductor/test_cpu_select_algorithm",
-    "inductor/test_aot_inductor_arrayref",
-    "inductor/test_torchinductor_codegen_dynamic_shapes",
-    "lazy/test_meta_kernel",
-    "onnx/test_utility_funs",
-    "profiler/test_profiler",
-    "test_ao_sparsity",
-    "test_cpp_extensions_open_device_registration",
-    "test_jit",
-    "test_metal",
-    "test_mps",
-    "dynamo/test_torchrec",
-    "inductor/test_aot_inductor_utils",
-    "inductor/test_coordinate_descent_tuner",
-    "test_jiterator",
-    # these tests run long and fail in addition to that
-    "dynamo/test_dynamic_shapes",
-    "test_quantization",
-    "inductor/test_torchinductor",
-    "inductor/test_torchinductor_dynamic_shapes",
-    "inductor/test_torchinductor_opinfo",
-    "test_binary_ufuncs",
-    "test_unary_ufuncs",
-    # these tests fail when cuda is not available
-    "inductor/test_cudacodecache",
-    "inductor/test_inductor_utils",
-    "inductor/test_inplacing_pass",
-    "inductor/test_kernel_benchmark",
-    "inductor/test_max_autotune",
-    "inductor/test_move_constructors_to_cuda",
-    "inductor/test_multi_kernel",
-    "inductor/test_pattern_matcher",
-    "inductor/test_perf",
-    "inductor/test_select_algorithm",
-    "inductor/test_snode_runtime",
-    "inductor/test_triton_wrapper",
-    # these tests fail when mkldnn is not available
-    "inductor/test_custom_post_grad_passes",
-    "inductor/test_mkldnn_pattern_matcher",
-    # lacks quantization support
-    "onnx/test_models_quantized_onnxruntime",
-    "onnx/test_pytorch_onnx_onnxruntime",
-    # https://github.com/pytorch/pytorch/issues/102078
-    "test_decomp",
-    # https://github.com/pytorch/pytorch/issues/146698
-    "test_model_exports_to_core_aten",
-    # runs very long, skip for now
-    "inductor/test_layout_optim",
-    "test_fx",
-    # some false errors
-    "doctests",
-]
-
-skip_list_regex = [
-    # distributed tests fail randomly
-    "distributed/.*",
-]
-
-all_testfiles = sorted(TESTS)
-
-filtered_testfiles = []
-
-for filename in all_testfiles:
-    if filename in skip_list:
-        continue
-
-    regex_filtered = False
-
-    for regex_string in skip_list_regex:
-        if re.fullmatch(regex_string, filename):
-            regex_filtered = True
-            break
-
-    if regex_filtered:
-        continue
-
-    filtered_testfiles.append(filename)
-
-for filename in filtered_testfiles:
-    print('    "' + filename + '",')
--- a/.github/scripts/trymerge.py
+++ b/.github/scripts/trymerge.py
@ -819,9 +819,10 @@ class GitHubPR:
                    cursor=info["reviews"]["pageInfo"]["startCursor"],
                )
                info = rc["data"]["repository"]["pullRequest"]
-        reviews = {
-            author: state for author, state in self._reviews if state != "COMMENTED"
-        }
+        reviews = {}
+        for author, state in self._reviews:
+            if state != "COMMENTED":
+                reviews[author] = state
        return list(reviews.items())

    def get_approved_by(self) -> list[str]:
@ -2281,8 +2282,7 @@ def merge(
        except MandatoryChecksMissingError as ex:
            last_exception = str(ex)
            print(
-                f"Merge of https://github.com/{pr.org}/{pr.project}/pull/{pr.pr_num} failed due to: {ex}. Retrying in 5 min",
-                flush=True,
+                f"Merge of https://github.com/{pr.org}/{pr.project}/pull/{pr.pr_num} failed due to: {ex}. Retrying in 5 min"
            )
            time.sleep(5 * 60)
    # Finally report timeout back
--- a/.github/workflows/_mac-build.yml
+++ b/.github/workflows/_mac-build.yml
@ -33,6 +33,10 @@ on:
        default: "3.9"
        description: |
          The python version to be used. Will be 3.9 by default
+      environment-file:
+        required: false
+        type: string
+        description: Set the conda environment file used to setup macOS build.
      test-matrix:
        required: false
        type: string
@ -82,12 +86,23 @@ jobs:
          fi

      - name: Setup miniconda
+        if: inputs.environment-file == ''
        uses: pytorch/test-infra/.github/actions/setup-miniconda@main
        with:
          python-version: ${{ inputs.python-version }}
          environment-file: .github/requirements/conda-env-${{ runner.os }}-${{ runner.arch }}
          pip-requirements-file: .github/requirements/pip-requirements-${{ runner.os }}.txt

+      # This option is used when cross-compiling for arm64 on x86-64. Specifically, we need an arm64 conda
+      # environment even though the runner arch is x86-64
+      - name: Setup miniconda using the provided environment file
+        if: inputs.environment-file != ''
+        uses: pytorch/test-infra/.github/actions/setup-miniconda@main
+        with:
+          python-version: ${{ inputs.python-version }}
+          environment-file: ${{ inputs.environment-file }}
+          pip-requirements-file: .github/requirements/pip-requirements-${{ runner.os }}.txt
+
      - name: Install sccache (only for non-forked PRs, and pushes to trunk)
        uses: nick-fields/retry@v3.0.0
        if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }}
--- a/.github/workflows/check-labels.yml
+++ b/.github/workflows/check-labels.yml
@ -35,7 +35,7 @@ jobs:
      pull-requests: write
    name: Check labels
    if: github.repository_owner == 'pytorch'
-    runs-on: linux.24_04.4x
+    runs-on: linux.20_04.4x
    steps:
      - name: Checkout PyTorch
        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
--- a/.github/workflows/docker-cache-mi300.yml
+++ b/.github/workflows/docker-cache-mi300.yml
@ -1,55 +0,0 @@
-name: docker-cache-mi300
-
-on:
-  # run every 6 hours
-  schedule:
-    - cron: 0 0,6,12,18 * * *
-  workflow_dispatch:
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name }}
-  cancel-in-progress: true
-
-permissions:
-  id-token: write
-  contents: read
-
-jobs:
-  docker-cache:
-    if: github.repository_owner == 'pytorch'
-    runs-on: rocm-docker
-    steps:
-      - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
-        with:
-          no-sudo: true
-
-      - name: configure aws credentials
-        id: aws_creds
-        uses: aws-actions/configure-aws-credentials@v4
-        with:
-          role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
-          aws-region: us-east-1
-          role-duration-seconds: 18000
-
-      - name: Login to Amazon ECR
-        id: login-ecr
-        continue-on-error: false
-        uses: aws-actions/amazon-ecr-login@v2
-
-      - name: Calculate docker image
-        id: calculate-docker-image
-        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
-        with:
-          docker-image-name: pytorch-linux-focal-rocm-n-py3
-          push: false
-
-      - name: Pull docker image
-        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
-        with:
-          docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
-
-      - name: Tar and upload to S3 bucket
-        run: |
-          sudo docker save -o ~/docker-data/pytorch/pytorch_docker_image.tar ${{ steps.calculate-docker-image.outputs.docker-image }}
-          sudo rclone copy -P --s3-upload-concurrency 64 --s3-chunk-size 200M --s3-upload-cutoff 300M ~/docker-data/pytorch/pytorch_docker_image.tar oci:pytorchbucket0002/pytorch_docker_image --progress
--- a/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml
+++ b/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml
@ -64,7 +64,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_9-cpu-aarch64
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_9-cpu-aarch64-test:  # Testing
@ -134,7 +134,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_9-cuda-aarch64-12_8
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.8.0.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.7.1.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -181,7 +181,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_10-cpu-aarch64
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_10-cpu-aarch64-test:  # Testing
@ -251,7 +251,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_10-cuda-aarch64-12_8
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.8.0.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.7.1.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -298,7 +298,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_11-cpu-aarch64
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_11-cpu-aarch64-test:  # Testing
@ -368,7 +368,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_11-cuda-aarch64-12_8
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.8.0.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.7.1.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -415,7 +415,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_12-cpu-aarch64
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_12-cpu-aarch64-test:  # Testing
@ -485,7 +485,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_12-cuda-aarch64-12_8
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.8.0.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.7.1.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -532,7 +532,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_13-cpu-aarch64
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_13-cpu-aarch64-test:  # Testing
@ -602,7 +602,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_13-cuda-aarch64-12_8
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.8.0.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.7.1.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -649,7 +649,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_13t-cpu-aarch64
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_13t-cpu-aarch64-test:  # Testing
@ -719,7 +719,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_13t-cuda-aarch64-12_8
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.8.0.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.7.1.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
--- a/.github/workflows/generated-linux-binary-manywheel-main.yml
+++ b/.github/workflows/generated-linux-binary-manywheel-main.yml
@ -105,7 +105,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_9-cuda12_6
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_9-cuda12_6-test:  # Testing
@ -152,7 +152,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_9-cuda12_8
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.8.0.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.7.1.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_9-cuda12_8-test:  # Testing
--- a/.github/workflows/generated-linux-binary-manywheel-nightly.yml
+++ b/.github/workflows/generated-linux-binary-manywheel-nightly.yml
@ -262,7 +262,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_9-cuda12_6
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_9-cuda12_6-test:  # Testing
@ -331,7 +331,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_9-cuda12_8
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.8.0.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.7.1.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_9-cuda12_8-test:  # Testing
@ -891,7 +891,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_10-cuda12_6
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_10-cuda12_6-test:  # Testing
@ -960,7 +960,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_10-cuda12_8
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.8.0.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.7.1.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_10-cuda12_8-test:  # Testing
@ -1520,7 +1520,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_11-cuda12_6
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_11-cuda12_6-test:  # Testing
@ -1654,7 +1654,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_11-cuda12_8
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.8.0.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.7.1.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_11-cuda12_8-test:  # Testing
@ -2214,7 +2214,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_12-cuda12_6
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_12-cuda12_6-test:  # Testing
@ -2283,7 +2283,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_12-cuda12_8
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.8.0.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.7.1.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_12-cuda12_8-test:  # Testing
@ -2843,7 +2843,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_13-cuda12_6
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_13-cuda12_6-test:  # Testing
@ -2912,7 +2912,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_13-cuda12_8
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.8.0.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.7.1.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_13-cuda12_8-test:  # Testing
@ -3472,7 +3472,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_13t-cuda12_6
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_13t-cuda12_6-test:  # Testing
@ -3541,7 +3541,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_13t-cuda12_8
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.8.0.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.7.1.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.2.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.7.53; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.55; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.0.11; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_13t-cuda12_8-test:  # Testing
--- a/.github/workflows/generated-linux-s390x-binary-manywheel-nightly.yml
+++ b/.github/workflows/generated-linux-s390x-binary-manywheel-nightly.yml
@ -63,7 +63,7 @@ jobs:
      timeout-minutes: 420
      build_name: manywheel-py3_9-cpu-s390x
      build_environment: linux-s390x-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_9-cpu-s390x-test:  # Testing
@ -128,7 +128,7 @@ jobs:
      timeout-minutes: 420
      build_name: manywheel-py3_10-cpu-s390x
      build_environment: linux-s390x-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_10-cpu-s390x-test:  # Testing
@ -193,7 +193,7 @@ jobs:
      timeout-minutes: 420
      build_name: manywheel-py3_11-cpu-s390x
      build_environment: linux-s390x-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_11-cpu-s390x-test:  # Testing
@ -258,7 +258,7 @@ jobs:
      timeout-minutes: 420
      build_name: manywheel-py3_12-cpu-s390x
      build_environment: linux-s390x-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_12-cpu-s390x-test:  # Testing
@ -323,7 +323,7 @@ jobs:
      timeout-minutes: 420
      build_name: manywheel-py3_13-cpu-s390x
      build_environment: linux-s390x-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_13-cpu-s390x-test:  # Testing
--- a/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml
+++ b/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml
@ -43,7 +43,7 @@ jobs:
      GPU_ARCH_TYPE: cpu
      SKIP_ALL_TESTS: 1
      DESIRED_PYTHON: "3.9"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
    steps:
      # NOTE: These environment variables are put here so that they can be applied on every job equally
      #       They are also here because setting them at a workflow level doesn't give us access to the
@ -167,7 +167,7 @@ jobs:
      GPU_ARCH_TYPE: cpu
      SKIP_ALL_TESTS: 1
      DESIRED_PYTHON: "3.10"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
    steps:
      # NOTE: These environment variables are put here so that they can be applied on every job equally
      #       They are also here because setting them at a workflow level doesn't give us access to the
@ -291,7 +291,7 @@ jobs:
      GPU_ARCH_TYPE: cpu
      SKIP_ALL_TESTS: 1
      DESIRED_PYTHON: "3.11"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
    steps:
      # NOTE: These environment variables are put here so that they can be applied on every job equally
      #       They are also here because setting them at a workflow level doesn't give us access to the
@ -415,7 +415,7 @@ jobs:
      GPU_ARCH_TYPE: cpu
      SKIP_ALL_TESTS: 1
      DESIRED_PYTHON: "3.12"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
    steps:
      # NOTE: These environment variables are put here so that they can be applied on every job equally
      #       They are also here because setting them at a workflow level doesn't give us access to the
@ -539,7 +539,7 @@ jobs:
      GPU_ARCH_TYPE: cpu
      SKIP_ALL_TESTS: 1
      DESIRED_PYTHON: "3.13"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
    steps:
      # NOTE: These environment variables are put here so that they can be applied on every job equally
      #       They are also here because setting them at a workflow level doesn't give us access to the
@ -663,7 +663,7 @@ jobs:
      GPU_ARCH_TYPE: cpu
      SKIP_ALL_TESTS: 1
      DESIRED_PYTHON: "3.13t"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
    steps:
      # NOTE: These environment variables are put here so that they can be applied on every job equally
      #       They are also here because setting them at a workflow level doesn't give us access to the
--- a/.github/workflows/generated-windows-arm64-binary-wheel-nightly.yml
+++ b/.github/workflows/generated-windows-arm64-binary-wheel-nightly.yml
@ -54,7 +54,7 @@ jobs:
      GPU_ARCH_TYPE: cpu
      SKIP_ALL_TESTS: 1
      DESIRED_PYTHON: "3.12"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
    steps:
      # NOTE: These environment variables are put here so that they can be applied on every job equally
      #       They are also here because setting them at a workflow level doesn't give us access to the
--- a/.github/workflows/generated-windows-binary-wheel-nightly.yml
+++ b/.github/workflows/generated-windows-binary-wheel-nightly.yml
@ -54,7 +54,7 @@ jobs:
      GPU_ARCH_TYPE: cpu
      SKIP_ALL_TESTS: 1
      DESIRED_PYTHON: "3.9"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
    steps:
      - name: Display EC2 information
        shell: bash
@ -290,7 +290,7 @@ jobs:
      GPU_ARCH_TYPE: cuda
      SKIP_ALL_TESTS: 1
      DESIRED_PYTHON: "3.9"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
    steps:
      - name: Display EC2 information
        shell: bash
@ -528,7 +528,7 @@ jobs:
      GPU_ARCH_TYPE: cuda
      SKIP_ALL_TESTS: 1
      DESIRED_PYTHON: "3.9"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
    steps:
      - name: Display EC2 information
        shell: bash
@ -766,7 +766,7 @@ jobs:
      GPU_ARCH_TYPE: cuda
      SKIP_ALL_TESTS: 1
      DESIRED_PYTHON: "3.9"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
    steps:
      - name: Display EC2 information
        shell: bash
@ -1238,7 +1238,7 @@ jobs:
      GPU_ARCH_TYPE: cpu
      SKIP_ALL_TESTS: 1
      DESIRED_PYTHON: "3.10"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
    steps:
      - name: Display EC2 information
        shell: bash
@ -1474,7 +1474,7 @@ jobs:
      GPU_ARCH_TYPE: cuda
      SKIP_ALL_TESTS: 1
      DESIRED_PYTHON: "3.10"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
    steps:
      - name: Display EC2 information
        shell: bash
@ -1712,7 +1712,7 @@ jobs:
      GPU_ARCH_TYPE: cuda
      SKIP_ALL_TESTS: 1
      DESIRED_PYTHON: "3.10"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
    steps:
      - name: Display EC2 information
        shell: bash
@ -1950,7 +1950,7 @@ jobs:
      GPU_ARCH_TYPE: cuda
      SKIP_ALL_TESTS: 1
      DESIRED_PYTHON: "3.10"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
    steps:
      - name: Display EC2 information
        shell: bash
@ -2422,7 +2422,7 @@ jobs:
      GPU_ARCH_TYPE: cpu
      SKIP_ALL_TESTS: 1
      DESIRED_PYTHON: "3.11"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
    steps:
      - name: Display EC2 information
        shell: bash
@ -2658,7 +2658,7 @@ jobs:
      GPU_ARCH_TYPE: cuda
      SKIP_ALL_TESTS: 1
      DESIRED_PYTHON: "3.11"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
    steps:
      - name: Display EC2 information
        shell: bash
@ -2896,7 +2896,7 @@ jobs:
      GPU_ARCH_TYPE: cuda
      SKIP_ALL_TESTS: 1
      DESIRED_PYTHON: "3.11"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
    steps:
      - name: Display EC2 information
        shell: bash
@ -3134,7 +3134,7 @@ jobs:
      GPU_ARCH_TYPE: cuda
      SKIP_ALL_TESTS: 1
      DESIRED_PYTHON: "3.11"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
    steps:
      - name: Display EC2 information
        shell: bash
@ -3606,7 +3606,7 @@ jobs:
      GPU_ARCH_TYPE: cpu
      SKIP_ALL_TESTS: 1
      DESIRED_PYTHON: "3.12"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
    steps:
      - name: Display EC2 information
        shell: bash
@ -3842,7 +3842,7 @@ jobs:
      GPU_ARCH_TYPE: cuda
      SKIP_ALL_TESTS: 1
      DESIRED_PYTHON: "3.12"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
    steps:
      - name: Display EC2 information
        shell: bash
@ -4080,7 +4080,7 @@ jobs:
      GPU_ARCH_TYPE: cuda
      SKIP_ALL_TESTS: 1
      DESIRED_PYTHON: "3.12"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
    steps:
      - name: Display EC2 information
        shell: bash
@ -4318,7 +4318,7 @@ jobs:
      GPU_ARCH_TYPE: cuda
      SKIP_ALL_TESTS: 1
      DESIRED_PYTHON: "3.12"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
    steps:
      - name: Display EC2 information
        shell: bash
@ -4790,7 +4790,7 @@ jobs:
      GPU_ARCH_TYPE: cpu
      SKIP_ALL_TESTS: 1
      DESIRED_PYTHON: "3.13"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
    steps:
      - name: Display EC2 information
        shell: bash
@ -5026,7 +5026,7 @@ jobs:
      GPU_ARCH_TYPE: cuda
      SKIP_ALL_TESTS: 1
      DESIRED_PYTHON: "3.13"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
    steps:
      - name: Display EC2 information
        shell: bash
@ -5264,7 +5264,7 @@ jobs:
      GPU_ARCH_TYPE: cuda
      SKIP_ALL_TESTS: 1
      DESIRED_PYTHON: "3.13"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
    steps:
      - name: Display EC2 information
        shell: bash
@ -5502,7 +5502,7 @@ jobs:
      GPU_ARCH_TYPE: cuda
      SKIP_ALL_TESTS: 1
      DESIRED_PYTHON: "3.13"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
    steps:
      - name: Display EC2 information
        shell: bash
@ -5974,7 +5974,7 @@ jobs:
      GPU_ARCH_TYPE: cpu
      SKIP_ALL_TESTS: 1
      DESIRED_PYTHON: "3.13t"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
    steps:
      - name: Display EC2 information
        shell: bash
@ -6210,7 +6210,7 @@ jobs:
      GPU_ARCH_TYPE: cuda
      SKIP_ALL_TESTS: 1
      DESIRED_PYTHON: "3.13t"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
    steps:
      - name: Display EC2 information
        shell: bash
@ -6448,7 +6448,7 @@ jobs:
      GPU_ARCH_TYPE: cuda
      SKIP_ALL_TESTS: 1
      DESIRED_PYTHON: "3.13t"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
    steps:
      - name: Display EC2 information
        shell: bash
@ -6686,7 +6686,7 @@ jobs:
      GPU_ARCH_TYPE: cuda
      SKIP_ALL_TESTS: 1
      DESIRED_PYTHON: "3.13t"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.25.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
    steps:
      - name: Display EC2 information
        shell: bash
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@ -26,7 +26,7 @@ jobs:
      curr_branch: ${{ github.head_ref || github.ref_name }}

  lintrunner-clang:
-    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
    needs: get-label-type
    with:
      timeout: 120
@ -43,7 +43,7 @@ jobs:
        .github/scripts/lintrunner.sh

  lintrunner-noclang:
-    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
    needs: get-label-type
    with:
      timeout: 120
@ -59,7 +59,7 @@ jobs:
        .github/scripts/lintrunner.sh

  quick-checks:
-    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
    needs: get-label-type
    with:
      timeout: 120
@ -116,7 +116,7 @@ jobs:
          bash .github/scripts/pr-sanity-check.sh

  workflow-checks:
-    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
    needs: get-label-type
    with:
      timeout: 120
@ -154,7 +154,7 @@ jobs:
        exit $RC

  toc:
-    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
    needs: get-label-type
    with:
      timeout: 120
@ -194,7 +194,7 @@ jobs:
  test-tools:
    name: Test tools
    if: ${{ github.repository == 'pytorch/pytorch' }}
-    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
    needs: get-label-type
    with:
      timeout: 120
@ -215,7 +215,7 @@ jobs:
  test_run_test:
    name: Test `run_test.py` is usable without boto3
    if: ${{ github.repository == 'pytorch/pytorch' }}
-    runs-on: linux.24_04.4x
+    runs-on: linux.20_04.4x
    steps:
      - name: Checkout PyTorch
        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
@ -241,18 +241,10 @@ jobs:
  test_collect_env:
    if: ${{ github.repository == 'pytorch/pytorch' }}
    name: Test collect_env
-    runs-on: ${{ matrix.runner }}
+    runs-on: linux.20_04.4x
    strategy:
      matrix:
-        include:
-          - test_type: with_torch
-            runner: linux.24_04.4x
-          - test_type: without_torch
-            runner: linux.24_04.4x
-          # NOTE: The oldest supported version of python for 24.04 is 3.8
-          #       so this cannot be updated if we want to keep this test at 3.6
-          - test_type: older_python_version
-            runner: linux.20_04.4x
+        test_type: [with_torch, without_torch, older_python_version]
    steps:
      # [see note: pytorch repo ref]
      # deep clone (fetch-depth 0) required, to allow us to use git log
--- a/.github/workflows/revert.yml
+++ b/.github/workflows/revert.yml
@ -7,7 +7,7 @@ on:
 jobs:
  do_revert:
    name: try_revert_pr_${{ github.event.client_payload.pr_num }}
-    runs-on: linux.24_04.4x
+    runs-on: linux.20_04.4x
    environment: mergebot
    env:
        GH_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
--- a/.github/workflows/test-check-binary.yml
+++ b/.github/workflows/test-check-binary.yml
@ -15,7 +15,7 @@ jobs:
  check_binary_linux_cpu:
    if: github.repository_owner == 'pytorch'
    name: Test check_binary.sh for Linux CPU
-    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
    with:
      docker-image: python:3.11
      docker-build-dir: "skip-docker-build"
@ -28,7 +28,7 @@ jobs:
  check_binary_linux_cuda:
    if: github.repository_owner == 'pytorch'
    name: Test check_binary.sh for Linux CUDA
-    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
    with:
      runner: linux.4xlarge.nvidia.gpu
      docker-image: python:3.11
--- a/.github/workflows/trymerge.yml
+++ b/.github/workflows/trymerge.yml
@ -7,7 +7,7 @@ on:
 jobs:
  do_merge:
    name: try_merge_pr_${{ github.event.client_payload.pr_num }}
-    runs-on: linux.24_04.4x
+    runs-on: linux.20_04.4x
    environment: mergebot
    permissions:
      id-token: write
--- a/RELEASE.md
+++ b/RELEASE.md
@ -19,7 +19,7 @@
    - [Cherry Picking Fixes](#cherry-picking-fixes)
      - [How to do Cherry Picking](#how-to-do-cherry-picking)
    - [Cherry Picking Reverts](#cherry-picking-reverts)
-  - [Preparing and Creating Final Release Candidate](#preparing-and-creating-final-release-candidate)
+  - [Preparing and Creating Final Release candidate](#preparing-and-creating-final-release-candidate)
  - [Promoting RCs to Stable](#promoting-rcs-to-stable)
  - [Additional Steps to prepare for release day](#additional-steps-to-prepare-for-release-day)
    - [Modify release matrix](#modify-release-matrix)
@ -63,7 +63,7 @@ Following is the Release Compatibility Matrix for PyTorch releases:

 ## Release Cadence

-Following is the release cadence. All future dates below are tentative. For latest updates on the release schedule, please follow [dev discuss](https://dev-discuss.pytorch.org/c/release-announcements/27). Please note: Patch Releases are optional.
+Following is the release cadence. All future dates below are tentative. For the latest updates on the release schedule, please follow [dev discuss](https://dev-discuss.pytorch.org/c/release-announcements/27). Please note: Patch Releases are optional.

 | Minor Version | Release branch cut | Release date | First patch release date | Second patch release date|
 | --- | --- | --- | --- | --- |
@ -91,20 +91,20 @@ Releasing a new version of PyTorch generally entails 3 major steps:

 ### Frequently Asked Questions

-* Q: What is a release branch cut  ?
+* Q: What is release branch cut  ?
  * A: When bulk of the tracked features merged into the main branch, the primary release engineer starts the release process of cutting the release branch by creating a new git branch based off of the current `main` development branch of PyTorch. This allows PyTorch development flow on `main` to continue uninterrupted, while the release engineering team focuses on stabilizing the release branch in order to release a series of release candidates (RC). The activities in the release branch include both regression and performance testing as well as polishing new features and fixing release-specific bugs. In general, new features *are not* added to the release branch after it was created.

-* Q: What is a cherry-pick ?
+* Q: What is cherry-pick ?
  * A: A cherry pick is a process of propagating commits from the main into the release branch, utilizing git's built in [cherry-pick feature](https://git-scm.com/docs/git-cherry-pick). These commits are typically limited to small fixes or documentation updates to ensure that the release engineering team has sufficient time to complete a thorough round of testing on the release branch. To nominate a fix for cherry-picking, a separate pull request must be created against the respective release branch and then mentioned in the Release Tracker issue (example: https://github.com/pytorch/pytorch/issues/94937) following the template from the issue description. The comment nominating a particular cherry-pick for inclusion in the release should include the committed PR against main branch, the newly created cherry-pick PR, as well as the acceptance criteria for why the cherry-pick is needed in the first place.

 ## Cutting a release branch preparations

-Following requirements need to be met prior to cutting a release branch:
+The following requirements need to be met prior to cutting a release branch:

-* Resolve all outstanding issues in the milestones (for example [1.11.0](https://github.com/pytorch/pytorch/milestone/28)) before first RC cut is completed. After RC cut is completed, the following script should be executed from test-infra repo in order to validate the presence of the fixes in the release branch:
+* Resolve all outstanding issues in the milestones (for example [1.11.0](https://github.com/pytorch/pytorch/milestone/28)) before the first RC cut is completed. After the RC cut is completed, the following script should be executed from the test-infra repo in order to validate the presence of the fixes in the release branch:
 ``` python github_analyze.py --repo-path ~/local/pytorch --remote upstream --branch release/1.11 --milestone-id 26 --missing-in-branch ```
-* Validate that all new workflows have been created in the PyTorch and domain libraries included in the release. Validate it against all dimensions of release matrix, including operating systems (Linux, MacOS, Windows), Python versions as well as CPU architectures (x86 and arm) and accelerator versions (CUDA, ROCm, XPU).
-* All the nightly jobs for pytorch and domain libraries should be green. Validate this using the following HUD links:
+* Validate that all new workflows have been created in the PyTorch and domain libraries included in the release. Validate it against all dimensions of the release matrix, including operating systems (Linux, MacOS, Windows), Python versions as well as CPU architectures (x86 and arm) and accelerator versions (CUDA, ROCm, XPU).
+* All the nightly jobs for pytorch and domain libraries should be green. Validate this using following HUD links:
  * [Pytorch](https://hud.pytorch.org/hud/pytorch/pytorch/nightly)
  * [TorchVision](https://hud.pytorch.org/hud/pytorch/vision/nightly)
  * [TorchAudio](https://hud.pytorch.org/hud/pytorch/audio/nightly)
@ -224,12 +224,12 @@ Backups are stored in a non-public S3 bucket at [`s3://pytorch-backup`](https://

 ### Release Candidate health validation

-Validate that the release jobs for pytorch and domain libraries are green. Validate this using the following HUD links:
+Validate that the release jobs for pytorch and domain libraries are green. Validate this using the following HUD links:
  * [Pytorch](https://hud.pytorch.org/hud/pytorch/pytorch/release%2F1.12)
  * [TorchVision](https://hud.pytorch.org/hud/pytorch/vision/release%2F1.12)
  * [TorchAudio](https://hud.pytorch.org/hud/pytorch/audio/release%2F1.12)

-Validate that the documentation build has completed and generated an entry corresponding to the release in the [docs repository](https://github.com/pytorch/docs/tree/main/).
+Validate that the documentation build has completed and generated an entry corresponding to the release in the [docs repository](https://github.com/pytorch/docs/tree/main/).

 ### Cherry Picking Fixes

@ -274,15 +274,15 @@ requires `pytorchbot`, so it's only available in PyTorch atm.

 ### Cherry Picking Reverts

-If a PR that has been cherry-picked into the release branch has been reverted, its cherry-pick must be reverted as well.
+If a PR that has been cherry-picked into the release branch has been reverted, its cherry-pick must be reverted as well.

-Reverts for changes that were committed into the main branch prior to the branch cut must be propagated into the release branch as well.
+Reverts for changes that were committed into the main branch prior to the branch cut must be propagated into the release branch as well.

-## Preparing and Creating Final Release Candidate
+## Preparing and Creating Final Release candidate

-The following requirements need to be met prior to creating the final Release Candidate:
+The following requirements need to be met prior to creating final Release Candidate :

-* Resolve all outstanding open issues in the milestone. There should be no open issues/PRs (for example [2.1.2](https://github.com/pytorch/pytorch/milestone/39)). Each issue should either be closed or de-milestoned.
+* Resolve all outstanding open issues in the milestone. There should be no open issues/PRs (for example [2.1.2](https://github.com/pytorch/pytorch/milestone/39)). The issue should either be closed or de-milestoned.

 * Validate that all closed milestone PRs are present in the release branch. Confirm this by running:
 ``` python github_analyze.py --repo-path ~/local/pytorch --remote upstream --branch release/2.2 --milestone-id 40 --missing-in-branch ```
@ -291,7 +291,7 @@ The following requirements need to be met prior to creating the final Release Ca

 * Perform [Release Candidate health validation](#release-candidate-health-validation). CI should have the green signal.

-After the final RC is created, the following tasks should be performed:
+After the final RC is created, the following tasks should be performed:

 * Perform [Release Candidate health validation](#release-candidate-health-validation). CI should have the green signal.

@ -323,25 +323,25 @@ Promotion should occur in two steps:

 ## Additional Steps to prepare for release day

-The following should be prepared for the release day:
+The following should be prepared for the release day

 ### Modify release matrix

-Modify the release matrix for the get started page. See the following [PR](https://github.com/pytorch/test-infra/pull/4611) as reference.
+Need to modify release matrix for get started page. See following [PR](https://github.com/pytorch/test-infra/pull/4611) as reference.

-The PR to update published_versions.json and quick-start-module.js is auto generated. See the following [PR](https://github.com/pytorch/pytorch.github.io/pull/1467) as reference.
+The PR to update published_versions.json and quick-start-module.js is auto generated. See following [PR](https://github.com/pytorch/pytorch.github.io/pull/1467) as reference.

-Please note: This PR needs to be merged on the release day and hence it should be absolutely free of any failures. To test this PR, open another test PR pointing to the Release Candidate location as described in the [Release Candidate Storage](#release-candidate-storage) section.
+Please note: This PR needs to be merged on the release day and hence it should be absolutely free of any failures. To test this PR, open another test PR pointing to the Release Candidate location described above in [Release Candidate Storage](RELEASE.md#release-candidate-storage).

 ### Open Google Colab issue

-This is normally done right after the release is completed. We need to create a Google Colab issue. See the following example [issue](https://github.com/googlecolab/colabtools/issues/2372)
+This is normally done right after the release is completed. We need to create a Google Colab issue. See the following example [issue](https://github.com/googlecolab/colabtools/issues/2372).

 # Patch Releases

 A patch release is a maintenance release of PyTorch that includes fixes for regressions found in a previous minor release. Patch releases typically will bump the `patch` version from semver (i.e. `[major].[minor].[patch]`).

-Please note: Starting from 2.1, one can expect up to 2 patch releases after every minor release. Patch releases are only published for the latest minor release.
+Please note: Starting from 2.1, one can expect up to 2 patch releases after every minor release. Patch releases are only published for the latest minor release.

 ## Patch Release Criteria

@ -363,29 +363,29 @@ Patch releases should be considered if a regression meets the following criteria
 > Main POC: Patch Release Managers, Triage Reviewers

 Patch releases should follow these high-level phases. This process starts immediately after the previous release has completed.
-The patch release process takes around 4-5 weeks to complete.
+Patch release process takes around 4-5 weeks to complete.

-1. Triage is a process where issues are identified, graded, compared to Patch Release Criteria and added to Patch Release milestone. This process normally takes 2 weeks after the release completion.
+1. Triage, is a process where issues are identified, graded, compared to Patch Release Criteria and added to Patch Release milestone. This process normally takes 2 weeks after the release completion.
 2. Go/No Go meeting between PyTorch Releng, PyTorch Core and Project Managers where potential issues triggering a release in milestones are reviewed, and following decisions are made:
-  * Should the new patch release be created?
+  * Should the new patch Release be created ?
  * Timeline execution for the patch release
-3. Cherry picking phase starts after the decision is made to create a patch release. At this point, a new release tracker for the patch release is created, and an announcement will be made on official channels [example announcement](https://dev-discuss.pytorch.org/t/pytorch-release-2-0-1-important-information/1176). The authors of the fixes to regressions will be asked to create their own cherry picks. This process normally takes 2 weeks.
-4. Building Binaries, Promotion to Stable and testing. After all cherry picks have been merged, Release Managers trigger a new build and produce a new release candidate. An announcement is made on the official channel about the RC availability at this point. This process normally takes 2 weeks.
+3. Cherry picking phase starts after the decision is made to create patch release. At this point a new release tracker for the patch release is created, and an announcement will be made on official channels [example announcement](https://dev-discuss.pytorch.org/t/pytorch-release-2-0-1-important-information/1176). The authors of the fixes to regressions will be asked to create their own cherry picks. This process normally takes 2 weeks.
+4. Building Binaries, Promotion to Stable and testing. After all cherry picks have been merged, Release Managers trigger new build and produce new release candidate. Announcement is made on the official channel about the RC availability at this point. This process normally takes 2 weeks.
 5. General Availability

 ### Triage

 > Main POC: Triage Reviewers

-1. Tag issues/pull requests that are candidates for a potential patch release with `triage review`
+1. Tag issues / pull requests that are candidates for a potential patch release with `triage review`
    * ![adding triage review label](https://user-images.githubusercontent.com/1700823/132589089-a9210a14-6159-409d-95e5-f79067f6fa38.png)
-2. Triage reviewers will then check if the regression/fix identified fits within the above mentioned [Patch Release Criteria](#patch-release-criteria)
-3. Triage reviewers will then add the issue/pull request to the related milestone (i.e. `1.9.1`) if the regression is found to be within the [Patch Release Criteria](#patch-release-criteria)
+2. Triage reviewers will then check if the regression / fix identified fits within above mentioned [Patch Release Criteria](#patch-release-criteria)
+3. Triage reviewers will then add the issue / pull request to the related milestone (i.e. `1.9.1`) if the regression is found to be within the [Patch Release Criteria](#patch-release-criteria)
    * ![adding to milestone](https://user-images.githubusercontent.com/1700823/131175980-148ff38d-44c3-4611-8a1f-cd2fd1f4c49d.png)

 ### Issue Tracker for Patch releases

-For patch releases, an issue tracker needs to be created. For a patch release, we require all cherry-pick changes to have links to either a high-priority GitHub issue or a CI failure from previous RC. An example of this would look like:
+An issue tracker needs to be created for each patch release. We require all cherry-pick changes to have links to either a high-priority GitHub issue or a CI failure from the previous RC. An example of this would look like:
 * https://github.com/pytorch/pytorch/issues/128436

 Only following issues are accepted:
--- a/aten/src/ATen/CMakeLists.txt
+++ b/aten/src/ATen/CMakeLists.txt
@ -343,32 +343,9 @@ if(USE_CUDA)
 endif()

 if(USE_ROCM)
-  # NOTE: The PyTorch build does not actually add_subdirectory
-  # third_party/composable_kernel or use it as a CMake library. What is used
-  # is header only, so this should be ok, except that the CMake build generates
-  # a ck/config.h. We just do that part here. Without this, the ck.h from the
-  # ROCM SDK may get accidentally used instead.
-  function(_pytorch_rocm_generate_ck_conf)
-    set(CK_ENABLE_INT8 "ON")
-    set(CK_ENABLE_FP16 "ON")
-    set(CK_ENABLE_FP32 "ON")
-    set(CK_ENABLE_FP64 "ON")
-    set(CK_ENABLE_BF16 "ON")
-    set(CK_ENABLE_FP8 "ON")
-    set(CK_ENABLE_BF8 "ON")
-    set(CK_USE_XDL "ON")
-    set(CK_USE_WMMA "ON")
-    configure_file(
-      "${Torch_SOURCE_DIR}/third_party/composable_kernel/include/ck/config.h.in"
-      "${CMAKE_CURRENT_BINARY_DIR}/composable_kernel/ck/config.h"
-      )
-  endfunction()
  list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/hip)
  list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/composable_kernel/include)
  list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/composable_kernel/library/include)
-  list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_BINARY_DIR}/composable_kernel)
-  _pytorch_rocm_generate_ck_conf()
-
  # Next two lines are needed because TunableOp uses third-party/fmt
  list(APPEND ATen_HIP_INCLUDE $<TARGET_PROPERTY:fmt::fmt-header-only,INTERFACE_INCLUDE_DIRECTORIES>)
  list(APPEND ATen_HIP_DEPENDENCY_LIBS fmt::fmt-header-only)
--- a/aten/src/ATen/Context.h
+++ b/aten/src/ATen/Context.h
@ -110,11 +110,6 @@ class TORCH_API Context {

  Allocator* getPinnedMemoryAllocator(
      std::optional<c10::DeviceType> device_type = std::nullopt) {
-    auto opt_device_type =
-        device_type.has_value() ? device_type : at::getAccelerator();
-    if (opt_device_type) {
-      lazyInitDevice(opt_device_type.value());
-    }
    return getAcceleratorHooksInterface(device_type).getPinnedMemoryAllocator();
  }

--- a/aten/src/ATen/EmptyTensor.cpp
+++ b/aten/src/ATen/EmptyTensor.cpp
@ -28,8 +28,10 @@ c10::Allocator* GetCPUAllocatorMaybePinned(bool pin_memory) {
      opt_device_type = at::getAccelerator(false);
    }
    if (opt_device_type.has_value()) {
-      return at::globalContext().getPinnedMemoryAllocator(
-          opt_device_type.value());
+      at::globalContext().lazyInitDevice(opt_device_type.value());
+      return at::globalContext()
+          .getAcceleratorHooksInterface(opt_device_type)
+          .getPinnedMemoryAllocator();
    } else {
      TORCH_CHECK(
          false, "Need to provide pin_memory allocator to use pin memory.")
--- a/aten/src/ATen/autocast_mode.cpp
+++ b/aten/src/ATen/autocast_mode.cpp
@ -64,7 +64,7 @@ thread_local std::array<at::ScalarType, at::COMPILE_TIME_MAX_DEVICE_TYPES>
        at::ScalarType::Undefined, // IDEEP.
        at::kHalf, // AMD HIP
        at::ScalarType::Undefined, // FPGA
-        at::kBFloat16, // ONNX Runtime / Microsoft
+        at::ScalarType::Undefined, // ONNX Runtime / Microsoft
        at::kBFloat16, // XLA / TPU
        at::ScalarType::Undefined, // Vulkan
        at::ScalarType::Undefined, // Metal
@ -500,44 +500,6 @@ TORCH_LIBRARY_IMPL(aten, AutocastMTIA, m) {
         TORCH_FN((&at::autocast::binary_cross_entropy_banned)));
 }

-// MAIA
-TORCH_LIBRARY_IMPL(_, AutocastMAIA, m) {
-  m.fallback(torch::CppFunction::makeFallthrough());
-}
-
-TORCH_LIBRARY_IMPL(aten, AutocastMAIA, m) {
-  // lower_precision_fp
-#define _KERNEL_MAIA_LOW_PRECISION_FP(...) \
-  KERNEL_MAIA(__VA_ARGS__, lower_precision_fp)
-
-  AT_FORALL_LOWER_PRECISION_FP(_KERNEL_MAIA_LOW_PRECISION_FP)
-
-  // fp32
-#define _KERNEL_MAIA_FP32(...) KERNEL_MAIA(__VA_ARGS__, fp32)
-
-  AT_FORALL_FP32(_KERNEL_MAIA_FP32)
-
-  // fp32_set_opt_dtype
-#define _KERNEL_MAIA_FP32_SET_OPT_DTYPE(...) \
-  KERNEL_MAIA(__VA_ARGS__, fp32_set_opt_dtype)
-
-  AT_FORALL_FP32_SET_OPT_DTYPE(_KERNEL_MAIA_FP32_SET_OPT_DTYPE)
-
-  // fp32_append_dtype
-  // The fp32_append_dtype wrapper overrides implicit promotion behavior.
-  // norm does not implicitly promote, but be aware when adding new ops to this policy.
-  AT_FORALL_DIFFERENT_REDISPATCH_SIGNATURE(
-      KERNEL_DIFFERENT_REDISPATCH_SIGNATURE_MAIA)
-
-  // promote
-#define _KERNEL_MAIA_PROMOTE(...) KERNEL_MAIA(__VA_ARGS__, promote)
-
-  AT_FORALL_PROMOTE(_KERNEL_MAIA_PROMOTE)
-
-  m.impl(TORCH_SELECTIVE_NAME("aten::binary_cross_entropy"),
-         TORCH_FN((&at::autocast::binary_cross_entropy_banned)));
-}
-
 // XPU
 TORCH_LIBRARY_IMPL(_, AutocastXPU, m) {
  m.fallback(torch::CppFunction::makeFallthrough());
--- a/aten/src/ATen/autocast_mode.h
+++ b/aten/src/ATen/autocast_mode.h
@ -126,11 +126,10 @@ TORCH_API inline void set_autocast_gpu_dtype(at::ScalarType dtype) {
 // NOLINTNEXTLINE(misc-use-internal-linkage)
 AT_FORALL_DEPRECATED_AUTOCAST_BACKENDS(DECLARE_DEPRECATED_AUTOCAST_APIS)

-const std::array<at::DeviceType, 10> _AUTOCAST_SUPPORTED_DEVICES{
+const std::array<at::DeviceType, 9> _AUTOCAST_SUPPORTED_DEVICES{
    at::kCPU,
    at::kCUDA,
    at::kMTIA,
-    at::kMAIA,
    at::kXPU,
    at::kIPU,
    at::kHPU,
@ -151,8 +150,6 @@ inline bool is_autocast_eligible(
          tensor.is_floating_point();
    case c10::DeviceType::MTIA:
      return tensor.is_mtia() && tensor.is_floating_point();
-    case c10::DeviceType::MAIA:
-      return tensor.is_maia() && tensor.is_floating_point();
    case c10::DeviceType::XPU:
      return tensor.is_xpu() && tensor.is_floating_point();
    case c10::DeviceType::IPU:
@ -180,8 +177,6 @@ inline DispatchKey get_autocast_dispatch_key_from_device_type(
      return DispatchKey::AutocastCPU;
    case c10::DeviceType::MTIA:
      return DispatchKey::AutocastMTIA;
-    case c10::DeviceType::MAIA:
-      return DispatchKey::AutocastMAIA;
    case c10::DeviceType::XPU:
      return DispatchKey::AutocastXPU;
    case c10::DeviceType::IPU:
@ -753,24 +748,6 @@ copy pasted in from VariableTypeEverything.cpp with appropriate substitutions.
      REDISPATCH_SIGNATURE,                         \
      POLICY)

-// KERNEL_MAIA/KERNEL_DIFFERENT_REDISPATCH_SIGNATURE_MAIA
-// registration (OP, POLICY) or (OP, OVERLOAD, POLICY) for AutocastMAIA
-#define KERNEL_MAIA(...) KERNEL(c10::DeviceType::MAIA, __VA_ARGS__)
-
-#define KERNEL_DIFFERENT_REDISPATCH_SIGNATURE_MAIA( \
-    REDISPATCH_FUNC,                                \
-    REGISTER_NAME,                                  \
-    REGISTER_SIGNATURE,                             \
-    REDISPATCH_SIGNATURE,                           \
-    POLICY)                                         \
-  KERNEL_DIFFERENT_REDISPATCH_SIGNATURE(            \
-      c10::DeviceType::MAIA,                        \
-      REDISPATCH_FUNC,                              \
-      REGISTER_NAME,                                \
-      REGISTER_SIGNATURE,                           \
-      REDISPATCH_SIGNATURE,                         \
-      POLICY)
-
 // KERNEL_XPU/KERNEL_DIFFERENT_REDISPATCH_SIGNATURE_XPU
 // registration (OP, POLICY) or (OP, OVERLOAD, POLICY) for AutocastXPU
 #define KERNEL_XPU(...) KERNEL(c10::DeviceType::XPU, __VA_ARGS__)
--- a/aten/src/ATen/core/VariableFallbackKernel.cpp
+++ b/aten/src/ATen/core/VariableFallbackKernel.cpp
@ -80,10 +80,6 @@ TORCH_LIBRARY_IMPL(_, AutogradMTIA, m) {
  m.fallback(AUTOGRAD_FALLBACK);
 }

-TORCH_LIBRARY_IMPL(_, AutogradMAIA, m) {
-  m.fallback(AUTOGRAD_FALLBACK);
-}
-
 TORCH_LIBRARY_IMPL(_, AutogradXLA, m) {
  m.fallback(AUTOGRAD_FALLBACK);
 }
--- a/aten/src/ATen/cuda/CUDABlas.cpp
+++ b/aten/src/ATen/cuda/CUDABlas.cpp
@ -1079,13 +1079,7 @@ void gemm_internal<float>(CUDABLAS_GEMM_ARGTYPES(float))
  }
 #ifdef USE_ROCM
  else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) {
-    auto dprops = at::cuda::getCurrentDeviceProperties();
-    c10::string_view arch(dprops->gcnArchName);
-    if (arch == "gfx1100") { //no CK GEMM version for gfx1100
-      gemm_internal_cublaslt<float>(CUDABLAS_GEMM_ARGS(float));
-    } else{
-      at::native::gemm_internal_ck<float>(CUDABLAS_GEMM_ARGS(float));
-    }
+    at::native::gemm_internal_ck<float>(CUDABLAS_GEMM_ARGS(float));
  }
 #endif
  else {
--- a/aten/src/ATen/cuda/detail/LazyNVRTC.cpp
+++ b/aten/src/ATen/cuda/detail/LazyNVRTC.cpp
@ -156,7 +156,6 @@ NVRTC_STUB2(nvrtcGetProgramLogSize,nvrtcProgram, size_t*)
 NVRTC_STUB2(nvrtcGetProgramLog, nvrtcProgram, char *)
 NVRTC_STUB3(nvrtcGetLoweredName, nvrtcProgram, const char *, const char **)

-CUDA_STUB2(cuModuleLoad, CUmodule*, const char*)
 CUDA_STUB2(cuModuleLoadData, CUmodule *, const void *)
 CUDA_STUB3(cuModuleGetFunction, CUfunction *, CUmodule, const char *)
 CUDA_STUB4(cuOccupancyMaxActiveBlocksPerMultiprocessor, int *, CUfunction, int, size_t)
@ -170,8 +169,6 @@ CUDA_STUB4(cuLinkCreate, unsigned int, CUjit_option *, void **, CUlinkState *)
 CUDA_STUB3(cuLinkComplete, CUlinkState, void **, size_t *)
 CUDA_STUB3(cuFuncSetAttribute, CUfunction, CUfunction_attribute, int)
 CUDA_STUB3(cuFuncGetAttribute, int*, CUfunction_attribute, CUfunction)
-CUDA_STUB3(cuPointerGetAttribute, void*, CUpointer_attribute, CUdeviceptr)
-

 #if defined(CUDA_VERSION) && CUDA_VERSION >= 12000
 CUresult CUDAAPI
--- a/aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h
+++ b/aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h
@ -43,7 +43,6 @@ namespace at::cuda {
  _(nvrtcGetProgramLogSize)                      \
  _(nvrtcGetProgramLog)                          \
  _(nvrtcGetLoweredName)                         \
-  _(cuModuleLoad)                                \
  _(cuModuleLoadData)                            \
  _(cuModuleLoadDataEx)                          \
  _(cuModuleGetFunction)                         \
@ -61,7 +60,6 @@ namespace at::cuda {
  _(cuLinkComplete)                              \
  _(cuFuncSetAttribute)                          \
  _(cuFuncGetAttribute)                          \
-  _(cuPointerGetAttribute)                       \

 #if defined(CUDA_VERSION) && CUDA_VERSION >= 12000
 #define AT_FORALL_NVRTC_EXTENDED(_)              \
--- a/aten/src/ATen/cuda/tunable/GemmCommon.h
+++ b/aten/src/ATen/cuda/tunable/GemmCommon.h
@ -575,20 +575,11 @@ struct ScaledGemmParams : OpParams {

  std::string BLASSignature() const override {
    // Excluding use_fast_accum and use_rowise booleans for now
-    if (bias_ptr == nullptr) {
-      return fmt::sprintf("- { function: matmul, M: %ld, N: %ld, K: %ld, lda: %ld, ldb: %ld, ldc: %ld, ldd: %ld, stride_a: 0, stride_b: 0, stride_c: 0, stride_d: 0, "
-        "transA: %c, transB: %c, batch_count: 1, scaleA: f32_r, scaleB: f32_r, a_type: %s, b_type: %s, c_type: %s, d_type: %s, scale_type: %s, compute_type: %s }",
-        m, n, k, lda, ldb, ldc, ldc, transa, transb,
-        ScalarTypeToBLASType(a_dtype), ScalarTypeToBLASType(b_dtype), ScalarTypeToBLASType(c_dtype), ScalarTypeToBLASType(c_dtype),
-        ComputeTypeFor<T>(), ComputeTypeFor<T>());
-    }
-    else {
-      return fmt::sprintf("- { function: matmul, M: %ld, N: %ld, K: %ld, lda: %ld, ldb: %ld, ldc: %ld, ldd: %ld, stride_a: 0, stride_b: 0, stride_c: 0, stride_d: 0, "
-        "transA: %c, transB: %c, batch_count: 1, scaleA: f32_r, scaleB: f32_r, a_type: %s, b_type: %s, c_type: %s, d_type: %s, bias_type: %s, scale_type: %s, compute_type: %s }",
-        m, n, k, lda, ldb, ldc, ldc, transa, transb,
-        ScalarTypeToBLASType(a_dtype), ScalarTypeToBLASType(b_dtype), ScalarTypeToBLASType(c_dtype), ScalarTypeToBLASType(c_dtype), ScalarTypeToBLASType(bias_dtype),
-        ComputeTypeFor<T>(), ComputeTypeFor<T>());
-    }
+    return fmt::sprintf("- { function: matmul, M: %ld, N: %ld, K: %ld, lda: %ld, ldb: %ld, ldc: %ld, ldd: %ld, stride_a: 0, stride_b: 0, stride_c: 0, stride_d: 0, "
+      "transA: %c, transB: %c, batch_count: 1, scaleA: f32_r, scaleB: f32_r, a_type: %s, b_type: %s, c_type: %s, d_type: %s, bias_type: %s, scale_type: %s, compute_type: %s }",
+      m, n, k, lda, ldb, ldc, ldc, transa, transb,
+      ScalarTypeToBLASType(a_dtype), ScalarTypeToBLASType(b_dtype), ScalarTypeToBLASType(c_dtype), ScalarTypeToBLASType(c_dtype), ScalarTypeToBLASType(bias_dtype),
+      ComputeTypeFor<T>(), ComputeTypeFor<T>());
  }

  std::string Signature() const override {
--- a/aten/src/ATen/cuda/tunable/GemmHipblaslt.h
+++ b/aten/src/ATen/cuda/tunable/GemmHipblaslt.h
@ -498,11 +498,7 @@ class HipblasltGemmOp : public Callable<ParamsT> {
            mat_c, HIPBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stride_c, sizeof(stride_c)));
      }

-      hipblasComputeType_t computeType = HIPBLAS_COMPUTE_32F;
-      if (at::globalContext().allowTF32CuBLAS()) {
-        computeType = HIPBLAS_COMPUTE_32F_FAST_TF32;
-      }
-      HipBlasLtMatmulDescriptor matmul(computeType, HIP_R_32F);
+      HipBlasLtMatmulDescriptor matmul(HIPBLAS_COMPUTE_32F, HIP_R_32F);
      matmul.setAttribute(HIPBLASLT_MATMUL_DESC_TRANSA, opa);
      matmul.setAttribute(HIPBLASLT_MATMUL_DESC_TRANSB, opb);

@ -615,11 +611,6 @@ auto GetHipBlasLtTypeStringAndOps() {
  auto in_out_datatype = HipDataTypeFor<CT>();
  std::vector<hipblasLtMatmulHeuristicResult_t> heuristic_result;

-  hipblasComputeType_t computeType = HIPBLAS_COMPUTE_32F;
-  if (at::globalContext().allowTF32CuBLAS()) {
-    computeType = HIPBLAS_COMPUTE_32F_FAST_TF32;
-  }
-
  hipblasLtHandle_t handle;
  TORCH_HIPBLASLT_CHECK(hipblasLtCreate(&handle));
  TORCH_HIPBLASLT_CHECK(hipblaslt_ext::getAllAlgos(handle,
@ -630,7 +621,7 @@ auto GetHipBlasLtTypeStringAndOps() {
        b_datatype,
        in_out_datatype,
        in_out_datatype,
-        computeType,
+        HIPBLAS_COMPUTE_32F,
        heuristic_result));
  TORCH_HIPBLASLT_CHECK(hipblasLtDestroy(handle));

--- a/aten/src/ATen/cuda/tunable/GemmRocblas.h
+++ b/aten/src/ATen/cuda/tunable/GemmRocblas.h
@ -141,8 +141,6 @@ class RocblasGemmOp : public Callable<GemmParams<T>> {

    TuningStatus Call(const GemmParams<T>* params) override {
      auto input_output_type = RocBlasDataTypeFor<T>();
-      if (at::globalContext().allowTF32CuBLAS() && input_output_type == rocblas_datatype_f32_r)
-        return FAIL;  // no support for TF32 in rocBLAS
      auto compute_type = RocBlasComputeTypeFor<T>();
      auto h_a = DoCastForHalfOrBfloat16(params->alpha);
      auto h_b = DoCastForHalfOrBfloat16(params->beta);
@ -209,8 +207,6 @@ class RocblasGemmStridedBatchedOp : public Callable<GemmStridedBatchedParams<T>>

    TuningStatus Call(const GemmStridedBatchedParams<T>* params) override {
      auto input_output_type = RocBlasDataTypeFor<T>();
-      if (at::globalContext().allowTF32CuBLAS() && input_output_type == rocblas_datatype_f32_r)
-        return FAIL;  // no support for TF32 in rocBLAS
      auto compute_type = RocBlasComputeTypeFor<T>();
      auto h_a = DoCastForHalfOrBfloat16(params->alpha);
      auto h_b = DoCastForHalfOrBfloat16(params->beta);
--- a/aten/src/ATen/native/CPUBlas.cpp
+++ b/aten/src/ATen/native/CPUBlas.cpp
@ -322,24 +322,6 @@ void gemm(
   const float beta,
   at::BFloat16 *c, int64_t ldc) {
   internal::normalize_last_dims(transa, transb, m, n, k, &lda, &ldb, &ldc);
-#if AT_MKLDNN_ENABLED()
-#ifdef __aarch64__
-   // MKLDNN also supports ARM for bf16, and the bypass is only
-   // currently intended for x86/x86_64.
-   const bool use_bf16_gemv_trans = false;
-#elif defined(__powerpc__)
-   const bool use_bf16_gemv_trans = false;
-#else
-   const bool bf16_gemv_trans_would_be_faster = cpuinfo_initialize() &&
-     !cpuinfo_has_x86_avx512bf16();
-   const bool use_bf16_gemv_trans = bf16_gemv_trans_would_be_faster &&
-     transa == TransposeType::Transpose &&
-     transb == TransposeType::NoTranspose && n == 1 && alpha == 1.0;
-#endif
-   if (!use_bf16_gemv_trans && mkldnn_bf16_gemm(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc)) {
-     return;
-   }
-#endif
 #if AT_BUILD_WITH_BLAS() && defined(BLAS_HAS_SBGEMM)
   if (use_blas_gemm(transa, transb, m, n, k, lda, ldb, ldc)) {
      int m_ = m, n_ = n, k_ = k, lda_ = lda, ldb_ = ldb, ldc_ = ldc;
@ -360,6 +342,24 @@ void gemm(
      }
      return;
   }
+#endif
+#if AT_MKLDNN_ENABLED()
+#ifdef __aarch64__
+   // MKLDNN also supports ARM for bf16, and the bypass is only
+   // currently intended for x86/x86_64.
+   const bool use_bf16_gemv_trans = false;
+#elif defined(__powerpc__)
+   const bool use_bf16_gemv_trans = false;
+#else
+   const bool bf16_gemv_trans_would_be_faster = cpuinfo_initialize() &&
+     !cpuinfo_has_x86_avx512bf16();
+   const bool use_bf16_gemv_trans = bf16_gemv_trans_would_be_faster &&
+     transa == TransposeType::Transpose &&
+     transb == TransposeType::NoTranspose && n == 1 && alpha == 1.0;
+#endif
+   if (!use_bf16_gemv_trans && mkldnn_bf16_gemm(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc)) {
+     return;
+   }
 #endif
   gemm_stub(
      at::kCPU, at::kBFloat16,
--- a/aten/src/ATen/native/TensorShape.cpp
+++ b/aten/src/ATen/native/TensorShape.cpp
@ -3610,11 +3610,11 @@ Tensor& transpose_(Tensor& self, int64_t dim0, int64_t dim1) {
    return at::_mkldnn_transpose_(self, dim0, dim1);
  }

-  SymDimVector sizes(self.sym_sizes().begin(), self.sym_sizes().end());
-  std::swap(sizes[dim0], sizes[dim1]);
-  SymDimVector strides(self.sym_strides().begin(), self.sym_strides().end());
+  DimVector sizes(self.sizes().begin(), self.sizes().end());
+  DimVector strides(self.strides().begin(), self.strides().end());
  std::swap(strides[dim0], strides[dim1]);
-  auto result = self.as_strided__symint(std::move(sizes), std::move(strides));
+  std::swap(sizes[dim0], sizes[dim1]);
+  self.as_strided_(sizes, strides);
  return self;
 }

--- a/aten/src/ATen/native/cpu/Activation.cpp
+++ b/aten/src/ATen/native/cpu/Activation.cpp
@ -832,9 +832,9 @@ void hardswish_backward_kernel(TensorIterator& iter) {
    cpu_kernel_vec(
      iter,
      [&](scalar_t grad_val, scalar_t self_val) -> scalar_t {
-        if (float(self_val) <= neg_three) {
+        if (float(self_val) < neg_three) {
          return zero;
-        } else if (float(self_val) < three) {
+        } else if (float(self_val) <= three) {
          return float(grad_val) * ((float(self_val) / three) + one_half);
        } else {
          return grad_val;
@ -847,19 +847,19 @@ void hardswish_backward_kernel(TensorIterator& iter) {
          Vec::blendv(
            grad_val0 * ((self_val0 / kThreeVec) + kOneHalfVec),
            grad_val0,
-            self_val0 >= kThreeVec
+            self_val0 > kThreeVec
          ),
          kZeroVec,
-          self_val0 <= kNegThreeVec
+          self_val0 < kNegThreeVec
        );
        self_val1 = Vec::blendv(
          Vec::blendv(
            grad_val1 * ((self_val1 / kThreeVec) + kOneHalfVec),
            grad_val1,
-            self_val1 >= kThreeVec
+            self_val1 > kThreeVec
          ),
          kZeroVec,
-          self_val1 <= kNegThreeVec
+          self_val1 < kNegThreeVec
        );
        return convert_from_float<scalar_t>(self_val0, self_val1);
      });
@ -878,9 +878,9 @@ void hardswish_backward_kernel(TensorIterator& iter) {
    cpu_kernel_vec(
      iter,
      [&](scalar_t grad_val, scalar_t self_val) {
-        if (self_val <= neg_three) {
+        if (self_val < neg_three) {
          return zero;
-        } else if (self_val < three) {
+        } else if (self_val <= three) {
          return grad_val * ((self_val / three) + one_half);
        } else {
          return grad_val;
@ -891,10 +891,10 @@ void hardswish_backward_kernel(TensorIterator& iter) {
          Vec::blendv(
            grad_val * ((self_val / kThreeVec) + kOneHalfVec),
            grad_val,
-            self_val >= kThreeVec
+            self_val > kThreeVec
          ),
          kZeroVec,
-          self_val <= kNegThreeVec
+          self_val < kNegThreeVec
        );
      }
    );
--- a/aten/src/ATen/native/cpu/Gelu.h
+++ b/aten/src/ATen/native/cpu/Gelu.h
@ -1,12 +1,5 @@
 #pragma once

-// On Windows, math.h needs to be included with _USE_MATH_DEFINES defined to
-// access constants such as M_SQRT2 and M_2_SQRTPI.
-#ifdef _WIN32
-#define _USE_MATH_DEFINES
-#include <cmath>
-#endif // _WIN32
-
 #include <ATen/cpu/vec/vec.h>
 #include <c10/util/BFloat16.h> // For c10::is_reduced_floating_point_v.

--- a/aten/src/ATen/native/cuda/ActivationHardswishKernel.cu
+++ b/aten/src/ATen/native/cuda/ActivationHardswishKernel.cu
@ -45,9 +45,9 @@ void hardswish_backward_kernel(TensorIterator& iter) {
      [zero, three, neg_three, one_half]GPU_LAMBDA(scalar_t grad_val_, scalar_t self_val_) -> scalar_t {
        opmath_t grad_val = static_cast<opmath_t>(grad_val_);
        opmath_t self_val = static_cast<opmath_t>(self_val_);
-        if (self_val <= neg_three) {
+        if (self_val < neg_three) {
          return zero;
-        } else if (self_val < three) {
+        } else if (self_val <= three) {
          return grad_val * ((self_val / three) + one_half);
        } else {
          return grad_val;
--- a/aten/src/ATen/native/cuda/CUDALoops.cuh
+++ b/aten/src/ATen/native/cuda/CUDALoops.cuh
@ -51,23 +51,6 @@

 namespace at::native {

-#ifdef USE_ROCM
-// Custom configuration for vectorized elementwise kernel
-// with template instantiation.
-namespace vectorized_templated_config {
-constexpr int num_threads() {
-  return 512;
-}
-
-constexpr int elems_per_thread() {
-  return 32;
-}
-
-constexpr int block_work_size() {
-  return elems_per_thread() * num_threads();
-}
-} // namespace vectorized_templated_config
-#endif

 template <typename args_t, size_t... Is>
 constexpr auto sum_of_sizes(args_t args, std::index_sequence<Is...>) {
@ -272,139 +255,6 @@ static inline void launch_vectorized_kernel(
  }
 }

-#ifdef USE_ROCM
-template <
-    int vec_size,
-    typename func_t,
-    typename array_t,
-    typename inp_calc_t,
-    typename out_calc_t,
-    typename loader_t,
-    typename storer_t,
-    typename OutputType,
-    typename... InputTypes>
-C10_LAUNCH_BOUNDS_1(vectorized_templated_config::num_threads())
-__global__ void vectorized_templated_elementwise_kernel(
-    int N,
-    func_t f,
-    array_t data,
-    inp_calc_t inp_calc,
-    out_calc_t out_calc,
-    loader_t loader,
-    storer_t storer) {
-  int remaining =
-      N - vectorized_templated_config::block_work_size() * blockIdx.x;
-  if (remaining <
-      vectorized_templated_config::block_work_size()) { // if this block handles
-                                                        // the reminder,
-    // just do a naive unrolled loop
-    auto policy = memory::policies::unroll_base<
-        vectorized_templated_config::num_threads(),
-        array_t,
-        inp_calc_t,
-        out_calc_t,
-        loader_t,
-        storer_t,
-        vectorized_templated_config::elems_per_thread()>(
-        data, remaining, inp_calc, out_calc, loader, storer);
-    elementwise_kernel_helper(f, policy);
-  } else { // if this block has a full `block_work_size` data to handle, use
-           // vectorized memory access
-    elementwise_kernel_helper(
-        f,
-        memory::policies::vectorized_templated<
-            vec_size,
-            array_t,
-            vectorized_templated_config::elems_per_thread(),
-            vectorized_templated_config::num_threads(),
-            OutputType,
-            InputTypes...>(data));
-  }
-}
-
-// This function assume trivial 1d and supports template specialization
-// to avoid dynamic casting.
-// Input vectorization size is based on runtime information, i.e.
-// the actual data types of the input and output tensor and cannot
-// be determined using the functor type, as in regular non-templated
-// vectorized kernels. The caller is in charge of selecting the correct input
-// vectorization length.
-template <
-    typename func_t,
-    typename array_t,
-    typename inp_calc_t,
-    typename out_calc_t,
-    typename loader_t,
-    typename storer_t,
-    typename OutputType,
-    typename... InputTypes>
-static inline void launch_vectorized_templated_kernel(
-    int64_t N,
-    const func_t& f,
-    array_t data,
-    inp_calc_t ic,
-    out_calc_t oc,
-    loader_t l,
-    storer_t s) {
-  TORCH_INTERNAL_ASSERT(N > 0 && N <= std::numeric_limits<int32_t>::max());
-  using traits = function_traits<func_t>;
-  int64_t grid = (N + vectorized_templated_config::block_work_size() - 1) /
-      vectorized_templated_config::block_work_size();
-  auto stream = at::cuda::getCurrentCUDAStream();
-  int vec_size = memory::can_vectorize_up_to<func_t>(data);
-  switch (vec_size) {
-    case 8:
-      vectorized_templated_elementwise_kernel<
-          8,
-          func_t,
-          array_t,
-          inp_calc_t,
-          out_calc_t,
-          loader_t,
-          storer_t,
-          OutputType,
-          InputTypes...>
-          <<<grid, vectorized_templated_config::num_threads(), 0, stream>>>(
-              N, f, data, ic, oc, l, s);
-      C10_CUDA_KERNEL_LAUNCH_CHECK();
-      break;
-    case 4:
-      vectorized_templated_elementwise_kernel<
-          4,
-          func_t,
-          array_t,
-          inp_calc_t,
-          out_calc_t,
-          loader_t,
-          storer_t,
-          OutputType,
-          InputTypes...>
-          <<<grid, vectorized_templated_config::num_threads(), 0, stream>>>(
-              N, f, data, ic, oc, l, s);
-      C10_CUDA_KERNEL_LAUNCH_CHECK();
-      break;
-    case 2:
-      vectorized_templated_elementwise_kernel<
-          2,
-          func_t,
-          array_t,
-          inp_calc_t,
-          out_calc_t,
-          loader_t,
-          storer_t,
-          OutputType,
-          InputTypes...>
-          <<<grid, vectorized_templated_config::num_threads(), 0, stream>>>(
-              N, f, data, ic, oc, l, s);
-      C10_CUDA_KERNEL_LAUNCH_CHECK();
-      break;
-    default:
-      // vector size 1 is not handled as part of vectorize_templated kernel
-      TORCH_INTERNAL_ASSERT(false, "Unexpected vectorization size");
-  }
-}
-#endif
-
 template <
    typename func_t,
    typename array_t,
@ -542,46 +392,6 @@ void gpu_kernel_impl_nocast(TensorIteratorBase& iter, const func_t& f) {
  });
 }

-#ifdef USE_ROCM
-namespace {
-template <typename TupleLike, size_t arity, size_t arg_num = 0>
-struct check_types {
-  constexpr static inline bool check() {
-    if constexpr (arity != 2)
-      return false;
-    if constexpr (arg_num == 0) {
-      using SelectedType = std::tuple_element_t<arg_num, TupleLike>;
-      if constexpr (std::is_same_v<float, SelectedType>)
-        return check_types<TupleLike, arity, arg_num + 1>::check();
-    } else if constexpr (arg_num == 1) {
-      using SelectedType2 = std::tuple_element_t<arg_num, TupleLike>;
-      if constexpr (std::is_same_v<float, SelectedType2>)
-        return check_types<TupleLike, arity, arg_num + 1>::check();
-    }
-    return false;
-  }
-};
-
-// Bottom case: if we got this far, assume correct type matching except
-// when there are no arguments (arity == 0).
-template <typename TupleLike, size_t arity>
-struct check_types<TupleLike, arity, arity> {
-  constexpr static inline bool check() {
-    if constexpr (arity != 0)
-      return true;
-    return false;
-  }
-};
-
-template <typename TupleLike>
-struct check_types<TupleLike, 0, 0> {
-  constexpr static inline bool check() {
-    return false;
-  }
-};
-} // namespace
-#endif
-
 template <typename func_t>
 void gpu_kernel_impl(TensorIteratorBase& iter, const func_t& f) {
  if (!needs_dynamic_casting<func_t>::check(iter)) {
@ -606,45 +416,6 @@ void gpu_kernel_impl(TensorIteratorBase& iter, const func_t& f) {

  if (contiguous) {
 #ifdef USE_ROCM
-    // Attempt to call specialized vectorized elementwise kernel
-    // that enables interleaving.
-    using float_map = c10::CppTypeToScalarType<float>;
-    using bfloat16_map = c10::CppTypeToScalarType<BFloat16>;
-    if (iter.ninputs() == 2 && iter.input_dtype(0) == float_map::value &&
-        iter.input_dtype(1) == bfloat16_map::value &&
-        memory::can_vectorize_up_to<func_t>(data) > 1) {
-      // constexpr to reduce the amount of kernels (empty) generated for
-      // vectorized templated elementwise and limit which functors are actually
-      // applied to the load and store at compile time.
-      using func_tuple = typename traits::ArgsTuple;
-      if constexpr (
-          std::is_same_v<float, arg0_t> && traits::arity == 2 &&
-          check_types<func_tuple, traits::arity, 0>::check()) {
-        auto input_offset_calculator = TrivialOffsetCalculator<traits::arity>();
-        auto output_offset_calculator = TrivialOffsetCalculator<1>();
-        auto loader = memory::LoadWithCast<traits::arity>(iter);
-        auto storer = memory::StoreWithCast<1>(iter);
-        launch_vectorized_templated_kernel<
-            func_t,
-            std::array<char*, ntensors>,
-            decltype(input_offset_calculator),
-            decltype(output_offset_calculator),
-            decltype(loader),
-            decltype(storer),
-            float,
-            float,
-            BFloat16>(
-            numel,
-            f,
-            data,
-            input_offset_calculator,
-            output_offset_calculator,
-            loader,
-            storer);
-        return;
-      }
-    }
-
    std::array<ScalarType, ntensors> dtypes;
    auto inner_strides = iter.get_inner_strides();
    std::array<int, ntensors> strides;
--- a/aten/src/ATen/native/cuda/MemoryAccess.cuh
+++ b/aten/src/ATen/native/cuda/MemoryAccess.cuh
@ -67,28 +67,6 @@ struct vectorized_load_helper {
  }
 };

-#ifdef USE_ROCM
-// Templated version of vectorized load helper.
-// It can be used on heterogeneous input tensor element types.
-template <int arg_index>
-struct vectorized_templated_load_helper {
-  template <typename args_t, typename policy_t>
-  static __device__ void apply(policy_t& self, args_t* args, int idx) {
-    using arg_t = std::tuple_element_t<arg_index, args_t>;
-    // `data` hold the data_ptr for tensors [output, input0, input1, ...], so we
-    // need a +1 offset to get the input
-
-    // Delay pointer arithmetic to the policy loader where we know the actual
-    // type of the current argument.
-    char* ptr = (self.data[arg_index + 1]);
-    auto args_accessor = [&args] __device__(int thread_unroll_idx) -> arg_t& {
-      return std::get<arg_index>(args[thread_unroll_idx]);
-    };
-    self.template load_single_arg<arg_index>(args_accessor, ptr, idx);
-  }
-};
-#endif
-
 template<int arg_index>
 struct unroll_load_helper {
  template <typename args_t, typename policy_t, typename offset_t, typename loader_t>
@ -203,16 +181,9 @@ __device__ aligned_vector<bool, vec_size> load_vector(const bool *base_ptr, uint

 namespace policies {

-template <
-    int num_threads,
-    typename data_t,
-    typename inp_calc_t,
-    typename out_calc_t,
-    typename loader_t,
-    typename storer_t,
-    int elems_per_thread,
-    int num_outputs = 1>
-struct unroll_base {
+template<typename data_t, typename inp_calc_t, typename out_calc_t, typename loader_t, typename storer_t, int elems_per_thread, int num_outputs=1>
+struct unroll {
+
  data_t data;
  int remaining;
  inp_calc_t input_offset_calculator;
@ -220,24 +191,12 @@ struct unroll_base {
  loader_t loader;
  storer_t storer;
  static constexpr int tws = elems_per_thread;
-  static constexpr int block_work_size = elems_per_thread * num_threads;

-  __device__ unroll_base(
-      data_t data,
-      int remaining,
-      inp_calc_t ic,
-      out_calc_t oc,
-      loader_t l,
-      storer_t s)
-      : data(data),
-        remaining(remaining),
-        input_offset_calculator(ic),
-        output_offset_calculator(oc),
-        loader(l),
-        storer(s) {}
+  __device__ unroll(data_t data, int remaining, inp_calc_t ic, out_calc_t oc, loader_t l, storer_t s):
+    data(data), remaining(remaining), input_offset_calculator(ic), output_offset_calculator(oc), loader(l), storer(s) {}

  __device__ inline bool check_inbounds(int thread_work_elem) {
-    return ((int)(threadIdx.x + thread_work_elem * num_threads) < remaining);
+    return ((int)(threadIdx.x  + thread_work_elem*num_threads()) < remaining);
  }

  template<typename args_t>
@ -246,13 +205,13 @@ struct unroll_base {
    int thread_idx = threadIdx.x;
    #pragma unroll
    for (int i = 0; i < elems_per_thread; i++) {
-      if (thread_idx < remaining) {
-        int linear_idx = thread_idx + block_work_size * idx;
-        auto offset = input_offset_calculator.get(linear_idx);
-        detail::static_unroll<detail::unroll_load_helper, arity>::with_args(
-            *this, args, offset, loader, i, num_outputs);
-        thread_idx += num_threads;
+      if (thread_idx >= remaining) {
+        return;
      }
+      int linear_idx = thread_idx + elems_per_thread * num_threads() * idx;
+      auto offset = input_offset_calculator.get(linear_idx);
+      detail::static_unroll<detail::unroll_load_helper, arity>::with_args(*this, args, offset, loader, i, num_outputs);
+      thread_idx += num_threads();
    }
  }

@ -261,36 +220,22 @@ struct unroll_base {
    int thread_idx = threadIdx.x;
    #pragma unroll
    for (int i = 0; i < elems_per_thread; i++) {
-      if (thread_idx < remaining) {
-        int linear_idx = thread_idx + block_work_size * idx;
-        int offset = output_offset_calculator.get(linear_idx)[0];
-        storer.store(from[i], data[0], offset);
-        thread_idx += num_threads;
+      if (thread_idx >= remaining) {
+        return;
      }
+      int linear_idx = thread_idx + elems_per_thread * num_threads() * idx;
+      int offset = output_offset_calculator.get(linear_idx)[0];
+      storer.store(from[i], data[0], offset);
+      thread_idx += num_threads();
    }
  }
 };

-// Utility type for all users of unroll that extract the num_threads value from
-// the caller scope.
-template <
-    typename data_t,
-    typename inp_calc_t,
-    typename out_calc_t,
-    typename loader_t,
-    typename storer_t,
-    int elems_per_thread,
-    int num_outputs = 1>
-using unroll = unroll_base<
-    num_threads(),
-    data_t,
-    inp_calc_t,
-    out_calc_t,
-    loader_t,
-    storer_t,
-    elems_per_thread,
-    num_outputs>;
-
+// Assumption:
+// all tensors are contiguous, that is: stride == sizeof(type) for all tensors
+// Note:
+// Functions in the vectorized policy do not do boundary checks. They assume the whole block
+// has work to do, so the remainders must be handled by the caller manually.
 template <int vec_size, typename data_t, int elems_per_thread>  // vec_size: number of scalars, can be 1, 2, or 4.
 struct vectorized {

@ -344,86 +289,6 @@ struct vectorized {
  }
 };

-#ifdef USE_ROCM
-// This is similar to vectorized policy above, but this one supports
-// heterogenous input tensor types as templated parameters.
-// Its use should be limited to frequently used heterogeneous data types
-// as each instantiation will generate a separate kernel, leading to code
-// bloating if applied to all combinations supported in PyTorch. Assumption: all
-// tensors are contiguous, that is: stride == sizeof(type) for all tensors.
-template <
-    int vec_size,
-    typename data_t,
-    int elems_per_thread,
-    int num_threads,
-    typename CastToT,
-    typename... CastFromTs> // vec_size: number of scalars, can be 1, 2, or 4.
-struct vectorized_templated {
-  static_assert(
-      elems_per_thread % vec_size == 0,
-      "The workload per thread must be a multiple of vec_size");
-  static constexpr int loop_size = elems_per_thread / vec_size;
-  static constexpr int tws = elems_per_thread;
-  static constexpr int block_work_size = elems_per_thread * num_threads;
-  data_t data;
-
-  __device__ vectorized_templated(data_t data) : data(data) {}
-
-  __device__ inline constexpr bool check_inbounds(int thread_work_elem) {
-    return true;
-  }
-
-  template <int arg_index, typename accessor_t>
-  __device__ inline void load_single_arg(accessor_t to, char* ptr, int idx) {
-    // extract the arg_index-th input tensor element type from the
-    // variadic template argument.
-    using CastFromT =
-        std::tuple_element_t<arg_index, std::tuple<CastFromTs...>>;
-    // Delayed pointer arithmetic from the caller: this is the place
-    // where we know the type of the argument.
-    CastFromT* block_ptr =
-        reinterpret_cast<CastFromT*>(ptr) + block_work_size * idx;
-    int thread_idx = threadIdx.x;
-#pragma unroll
-    for (int i = 0; i < loop_size; i++) {
-      int index = thread_idx + i * num_threads;
-      auto v = load_vector<vec_size>(block_ptr, index);
-#pragma unroll
-      for (int j = 0; j < vec_size; j++) {
-        to(vec_size * i + j) = c10::convert<CastToT>(v.val[j]);
-      }
-    }
-  }
-
-  template <typename args_t>
-  __device__ inline void load(args_t* args, int idx) {
-    constexpr int arity = std::tuple_size<args_t>::value;
-    detail::static_unroll<detail::vectorized_templated_load_helper, arity>::
-        with_args(*this, args, idx);
-  }
-
-  // Assume for now that from (temporary array per thread) is of the same
-  // type as to (destination tensor), which is the case for
-  // float(float,bfloat16) and functor add on float(float,float).
-  template <typename scalar_t>
-  __device__ inline void store(scalar_t* from, int idx) {
-    using vec_t = aligned_vector<scalar_t, vec_size>;
-    scalar_t* to = reinterpret_cast<scalar_t*>(data[0]) + block_work_size * idx;
-    vec_t* to_ = reinterpret_cast<vec_t*>(to);
-    int thread_idx = threadIdx.x;
-#pragma unroll
-    for (int i = 0; i < loop_size; i++) {
-      int index = thread_idx + i * num_threads;
-      vec_t v;
-      for (int j = 0; j < vec_size; j++) {
-        v.val[j] = from[vec_size * i + j];
-      }
-      to_[index] = v;
-    }
-  }
-};
-#endif
-
 template <typename data_t, typename inp_calc_t, typename out_calc_t, int num_outputs>
 struct multi_outputs_unroll {
  //multi_outputs_unroll struct members and check_inbounds and load methods are copypasted from unroll struct
--- a/aten/src/ATen/native/cuda/SoftMax.cu
+++ b/aten/src/ATen/native/cuda/SoftMax.cu
@ -89,20 +89,6 @@ struct SoftMaxBackwardEpilogue {
  const AccumT sum;
 };

-template<typename T, typename AccumT, typename OutT>
- struct SoftMaxForwardWithMulEpilogue {
-   __device__ __forceinline__ SoftMaxForwardWithMulEpilogue(AccumT max_input, AccumT sum)
-     : max_input(max_input)
-     , sum(sum) {}
-
-   __device__ __forceinline__ OutT operator()(T input) const {
-     return static_cast<OutT>(__expf(input - max_input) * sum);
-   }
-
-   const AccumT max_input;
-   const AccumT sum;
- };
-



@ -401,19 +387,6 @@ struct SumExpFloat
  const AccumT max_k;
 };

-template<typename T, typename AccumT>
-struct SumExpfFloat
-{
-  __device__ __forceinline__ SumExpfFloat(AccumT v)
-    : max_k(v) {}
-
-  __device__ __forceinline__ AccumT operator()(AccumT sum, T v) const {
-    return sum + __expf(v - max_k);
-  }
-
-  const AccumT max_k;
-};
-
 template <template<typename> class Reduction, typename AccumT>
 __device__ __forceinline__ AccumT
 blockReduce(AccumT* smem, AccumT val,
@ -476,19 +449,6 @@ T blockReduceWarp(T* smem_cache, T value, const Reduction<T>& op, T defaultVal)
  return smem_cache[0];
 }

-
-template <template<typename> class Reduction, typename T>
-__device__ __forceinline__
-T blockReduceWarpInverse(T* smem_cache, T value, const Reduction<T>& op, T defaultVal)
-{
-  T result = cuda_utils::BlockReduce<T, Reduction<T>>(value, op, defaultVal, smem_cache);
-  if (threadIdx.x == 0) {
-    smem_cache[0] = 1 / result;
-  }
-  __syncthreads();
-  return smem_cache[0];
-}
-
 template <template<typename, typename> class Reduction, int ILP, typename T, typename AccumT, typename index_t=int>
 __device__ __forceinline__ AccumT
 ilpReduce(index_t shift,
@ -704,38 +664,6 @@ WriteBpropResults(
  }
 }

-template <int ILP, typename scalar_t, typename accscalar_t, typename outscalar_t, template <typename, typename, typename> class EpilogueWithMul>
-__global__ void
-cunn_SoftMaxForwardFast(outscalar_t *output, const scalar_t *input, int classes)
-{
-  extern __shared__ unsigned char smem[];
-  auto sdata = reinterpret_cast<accscalar_t*>(smem);
-
-  // each block handles a sample in the mini-batch
-  input += static_cast<int64_t>(blockIdx.x) * classes;
-  output += static_cast<int64_t>(blockIdx.x) * classes;
-
-  const int shift = ((uint64_t)input) % ALIGN_BYTES / sizeof(scalar_t);
-
-  // find the max
-  accscalar_t threadMax = ilpReduce<MaxFloat, ILP, scalar_t, accscalar_t>(
-    shift, input, classes, MaxFloat<scalar_t, accscalar_t>(), -at::numeric_limits<accscalar_t>::max());
-  accscalar_t max_k = blockReduceWarp<Max, accscalar_t>(sdata, threadMax,
-    Max<accscalar_t>(), -at::numeric_limits<accscalar_t>::max());
-
-  // reduce all values
-  accscalar_t threadExp = ilpReduce<SumExpfFloat, ILP, scalar_t, accscalar_t>(
-    shift, input, classes, SumExpfFloat<scalar_t, accscalar_t>(max_k), static_cast<accscalar_t>(0));
-  accscalar_t sumAll = blockReduceWarpInverse<Add, accscalar_t>(sdata, threadExp,
-    Add<accscalar_t>(), static_cast<accscalar_t>(0));
-
-  EpilogueWithMul<scalar_t, accscalar_t, outscalar_t> epilogue(max_k, sumAll);
-
-  for (int offset = threadIdx.x; offset < classes; offset += blockDim.x) {
-    output[offset] = epilogue(input[offset]);
-  }
-}
-
 template <int ILP, typename scalar_t, typename accscalar_t, typename outscalar_t, template <typename, typename, typename> class Epilogue>
 __global__ void
 cunn_SoftMaxForward(outscalar_t *output, const scalar_t *input, int classes)
@ -827,68 +755,6 @@ cunn_SoftMaxForwardReg(outscalar_t *output, const scalar_t *input, index_t class
  }
 }

-
-template <int ILP, typename scalar_t, typename accscalar_t, typename outscalar_t,
-  template <typename, typename, typename> class EpilogueWithMul, typename index_t = int32_t>
-__global__ void
-cunn_SoftMaxForwardGmem(outscalar_t *output, const scalar_t *input, index_t classes)
-{
-  // Each thread block processes a sample in the batch
-  input += static_cast<int64_t>(blockIdx.x) * classes;
-  output += static_cast<int64_t>(blockIdx.x) * classes;
-
-  accscalar_t threadMax = -at::numeric_limits<accscalar_t>::max();
-  accscalar_t threadExp = static_cast<accscalar_t>(0);
-
-  // The first smem segment is used to cache input values and the last
-  // segment is used for thread block reductions
-  extern __shared__ unsigned char smem[];
-  auto smem_reduction_cache = reinterpret_cast<accscalar_t*>(smem);
-
-  using LoadT = at::native::memory::aligned_vector<scalar_t, ILP>;
-  const LoadT* const input_vec_ptr = reinterpret_cast<const LoadT*>(input);
-
-  // Do the first step in max calculation:
-  MaxFloat<scalar_t, accscalar_t> maxFunc;
-  for (index_t offset = threadIdx.x; offset * ILP < classes; offset += blockDim.x) {
-    LoadT crnt_vec = input_vec_ptr[offset];
-    #pragma unroll
-    for (int i = 0; i < ILP; ++i) {
-      threadMax = maxFunc(threadMax, crnt_vec.val[i]);
-    }
-  }
-
-  accscalar_t max_k = blockReduceWarp<Max, accscalar_t>(smem_reduction_cache, threadMax,
-    Max<accscalar_t>(), -at::numeric_limits<accscalar_t>::max());
-
-  // Do the second step in sum exp calculation:
-  SumExpfFloat<scalar_t, accscalar_t> sumExpFunc(max_k);
-  for (index_t offset = threadIdx.x; offset * ILP < classes; offset += blockDim.x) {
-    LoadT crnt_vec = input_vec_ptr[offset];
-    #pragma unroll
-    for (int i = 0; i < ILP; ++i) {
-      threadExp = sumExpFunc(threadExp, crnt_vec.val[i]);
-    }
-  }
-
-  accscalar_t sumAll = blockReduceWarpInverse<Add, accscalar_t>(smem_reduction_cache, threadExp,
-    Add<accscalar_t>(), static_cast<accscalar_t>(0));
-
-  EpilogueWithMul<scalar_t, accscalar_t, outscalar_t> epilogue(max_k, sumAll);
-
-  using StoreT = at::native::memory::aligned_vector<outscalar_t, ILP>;
-  StoreT* output_vec_ptr = reinterpret_cast<StoreT*>(output);
-  for (index_t offset = threadIdx.x; offset * ILP < classes; offset += blockDim.x) {
-    LoadT crnt_vec = input_vec_ptr[offset];
-    StoreT out_vec;
-    #pragma unroll
-    for (int i = 0; i < ILP; ++i) {
-      out_vec.val[i] = epilogue(crnt_vec.val[i]);
-    }
-    output_vec_ptr[offset] = out_vec;
-  }
-}
-
 template <int ILP, typename scalar_t, typename accscalar_t, typename outscalar_t,
  template <typename, typename, typename> class Epilogue, typename index_t = int32_t>
 __global__ void
@ -1069,9 +935,7 @@ cunn_SoftMaxBackwardSmem(scalar_t *gradInput, const outscalar_t *output, const o
  }
 }

-
- template<template<typename, typename, typename> class Epilogue,
-          template<typename, typename, typename> class EpilogueWithMul, bool is_log_softmax, bool use_fast_softmax>
+template<template<typename, typename, typename> class Epilogue, bool is_log_softmax>
 Tensor host_softmax(const Tensor & input_, const int64_t dim_, const bool half_to_float, const Tensor& output){
  if (half_to_float) {
    TORCH_CHECK(input_.scalar_type() == ScalarType::Half, "conversion is supported for Half type only");
@ -1113,78 +977,66 @@ Tensor host_softmax(const Tensor & input_, const int64_t dim_, const bool half_t
            }
          } else {
            constexpr int ILP = sizeof(float4) / sizeof(scalar_t);
-            if constexpr (use_fast_softmax) {
-              dim3 block(512);
-              size_t smem_reduction_sz = block.x / C10_WARP_SIZE * sizeof(accscalar_t);
-              if (dim_size % ILP == 0) {
-                cunn_SoftMaxForwardGmem<ILP, scalar_t, accscalar_t, scalar_t, EpilogueWithMul>
+            dim3 block = SoftMaxForward_getBlockSize(dim_size);
+            size_t smem_reduction_sz = block.x / C10_WARP_SIZE * sizeof(accscalar_t);
+            auto max_elements_per_smem = (at::cuda::getCurrentDeviceProperties()->sharedMemPerBlock -
+              smem_reduction_sz) / sizeof(scalar_t);
+
+            bool can_use_smem = static_cast<size_t>(dim_size) < max_elements_per_smem;
+            can_use_smem &= !(reinterpret_cast<uintptr_t>(input_ptr) % ALIGN_BYTES);
+            can_use_smem &= (!(reinterpret_cast<uintptr_t>(output_ptr) % ALIGN_BYTES));
+            can_use_smem &= !(dim_size % ILP);
+
+            int32_t potential_reg_cnt = potential_register_count(dim_size, block.x);
+            if(potential_reg_cnt < 10){
+              TORCH_INTERNAL_ASSERT(potential_reg_cnt > 0, "potential_reg_cnt for softmax with register should be greater than 0.");
+              switch (potential_reg_cnt) {
+                // TODO(Wenqin): investigate why a macro cannot be used for the code below;
+                // on MSVS the macro does not seem to expand correctly.
+                case 1:
+                  cunn_SoftMaxForwardReg<scalar_t, accscalar_t, scalar_t, Epilogue, int64_t, 1>
                    <<<grid, block, smem_reduction_sz, stream>>>(output_ptr, input_ptr, dim_size);
-              } else {
-                cunn_SoftMaxForwardFast<ILP, scalar_t, accscalar_t, scalar_t, EpilogueWithMul>
+                  break;
+                case 2:
+                  cunn_SoftMaxForwardReg<scalar_t, accscalar_t, scalar_t, Epilogue, int64_t, 2>
                    <<<grid, block, smem_reduction_sz, stream>>>(output_ptr, input_ptr, dim_size);
+                  break;
+                case 3:
+                  cunn_SoftMaxForwardReg<scalar_t, accscalar_t, scalar_t, Epilogue, int64_t, 3>
+                    <<<grid, block, smem_reduction_sz, stream>>>(output_ptr, input_ptr, dim_size);
+                  break;
+                case 4:
+                  cunn_SoftMaxForwardReg<scalar_t, accscalar_t, scalar_t, Epilogue, int64_t, 4>
+                    <<<grid, block, smem_reduction_sz, stream>>>(output_ptr, input_ptr, dim_size);
+                  break;
+                case 5:
+                  cunn_SoftMaxForwardReg<scalar_t, accscalar_t, scalar_t, Epilogue, int64_t, 5>
+                    <<<grid, block, smem_reduction_sz, stream>>>(output_ptr, input_ptr, dim_size);
+                  break;
+                case 6:
+                  cunn_SoftMaxForwardReg<scalar_t, accscalar_t, scalar_t, Epilogue, int64_t, 6>
+                    <<<grid, block, smem_reduction_sz, stream>>>(output_ptr, input_ptr, dim_size);
+                  break;
+                case 7:
+                  cunn_SoftMaxForwardReg<scalar_t, accscalar_t, scalar_t, Epilogue, int64_t, 7>
+                    <<<grid, block, smem_reduction_sz, stream>>>(output_ptr, input_ptr, dim_size);
+                  break;
+                case 8:
+                  cunn_SoftMaxForwardReg<scalar_t, accscalar_t, scalar_t, Epilogue, int64_t, 8>
+                    <<<grid, block, smem_reduction_sz, stream>>>(output_ptr, input_ptr, dim_size);
+                  break;
+                case 9:
+                  cunn_SoftMaxForwardReg<scalar_t, accscalar_t, scalar_t, Epilogue, int64_t, 9>
+                    <<<grid, block, smem_reduction_sz, stream>>>(output_ptr, input_ptr, dim_size);
+                  break;
              }
+            } else if (can_use_smem) {
+              size_t smem_sz = dim_size * sizeof(scalar_t) + smem_reduction_sz;
+              cunn_SoftMaxForwardSmem<ILP, scalar_t, accscalar_t, scalar_t, Epilogue>
+                <<<grid, block, smem_sz, stream>>>(output_ptr, input_ptr, dim_size);
            } else {
-              dim3 block = SoftMaxForward_getBlockSize(dim_size);
-              size_t smem_reduction_sz = block.x / C10_WARP_SIZE * sizeof(accscalar_t);
-              auto max_elements_per_smem = (at::cuda::getCurrentDeviceProperties()->sharedMemPerBlock -
-                smem_reduction_sz) / sizeof(scalar_t);
-
-              bool can_use_smem = static_cast<size_t>(dim_size) < max_elements_per_smem;
-              can_use_smem &= !(reinterpret_cast<uintptr_t>(input_ptr) % ALIGN_BYTES);
-              can_use_smem &= (!(reinterpret_cast<uintptr_t>(output_ptr) % ALIGN_BYTES));
-              can_use_smem &= !(dim_size % ILP);
-
-              int32_t potential_reg_cnt = potential_register_count(dim_size, block.x);
-              if(potential_reg_cnt < 10){
-                TORCH_INTERNAL_ASSERT(potential_reg_cnt > 0, "potential_reg_cnt for softmax with register should be greater than 0.");
-                switch (potential_reg_cnt) {
-                  // TODO(Wenqin): try to investigate why we couldn't use macro for below code,
-                  // because it seems on MSVS, it seems the macro way didn't expand correct.
-                  case 1:
-                    cunn_SoftMaxForwardReg<scalar_t, accscalar_t, scalar_t, Epilogue, int64_t, 1>
-                      <<<grid, block, smem_reduction_sz, stream>>>(output_ptr, input_ptr, dim_size);
-                    break;
-                  case 2:
-                    cunn_SoftMaxForwardReg<scalar_t, accscalar_t, scalar_t, Epilogue, int64_t, 2>
-                      <<<grid, block, smem_reduction_sz, stream>>>(output_ptr, input_ptr, dim_size);
-                    break;
-                  case 3:
-                    cunn_SoftMaxForwardReg<scalar_t, accscalar_t, scalar_t, Epilogue, int64_t, 3>
-                      <<<grid, block, smem_reduction_sz, stream>>>(output_ptr, input_ptr, dim_size);
-                    break;
-                  case 4:
-                    cunn_SoftMaxForwardReg<scalar_t, accscalar_t, scalar_t, Epilogue, int64_t, 4>
-                      <<<grid, block, smem_reduction_sz, stream>>>(output_ptr, input_ptr, dim_size);
-                    break;
-                  case 5:
-                    cunn_SoftMaxForwardReg<scalar_t, accscalar_t, scalar_t, Epilogue, int64_t, 5>
-                      <<<grid, block, smem_reduction_sz, stream>>>(output_ptr, input_ptr, dim_size);
-                    break;
-                  case 6:
-                    cunn_SoftMaxForwardReg<scalar_t, accscalar_t, scalar_t, Epilogue, int64_t, 6>
-                      <<<grid, block, smem_reduction_sz, stream>>>(output_ptr, input_ptr, dim_size);
-                    break;
-                  case 7:
-                    cunn_SoftMaxForwardReg<scalar_t, accscalar_t, scalar_t, Epilogue, int64_t, 7>
-                      <<<grid, block, smem_reduction_sz, stream>>>(output_ptr, input_ptr, dim_size);
-                    break;
-                  case 8:
-                    cunn_SoftMaxForwardReg<scalar_t, accscalar_t, scalar_t, Epilogue, int64_t, 8>
-                      <<<grid, block, smem_reduction_sz, stream>>>(output_ptr, input_ptr, dim_size);
-                    break;
-                  case 9:
-                    cunn_SoftMaxForwardReg<scalar_t, accscalar_t, scalar_t, Epilogue, int64_t, 9>
-                      <<<grid, block, smem_reduction_sz, stream>>>(output_ptr, input_ptr, dim_size);
-                    break;
-                }
-              } else if (can_use_smem) {
-                size_t smem_sz = dim_size * sizeof(scalar_t) + smem_reduction_sz;
-                cunn_SoftMaxForwardSmem<ILP, scalar_t, accscalar_t, scalar_t, Epilogue>
-                  <<<grid, block, smem_sz, stream>>>(output_ptr, input_ptr, dim_size);
-              } else {
-                cunn_SoftMaxForward<ILP, scalar_t, accscalar_t, scalar_t, Epilogue>
-                  <<<grid, block, smem_reduction_sz, stream>>>(output_ptr, input_ptr, dim_size);
-              }
+              cunn_SoftMaxForward<ILP, scalar_t, accscalar_t, scalar_t, Epilogue>
+                <<<grid, block, smem_reduction_sz, stream>>>(output_ptr, input_ptr, dim_size);
            }

            C10_CUDA_KERNEL_LAUNCH_CHECK();
@ -1204,35 +1056,23 @@ Tensor host_softmax(const Tensor & input_, const int64_t dim_, const bool half_t
            }
          } else {
            constexpr int ILP = sizeof(float4) / sizeof(scalar_t);
-            if constexpr (use_fast_softmax) {
-              dim3 block(512);
-              size_t smem_reduction_sz = block.x / C10_WARP_SIZE * sizeof(accscalar_t);
-              if (dim_size % ILP == 0) {
-                cunn_SoftMaxForwardGmem<ILP, scalar_t, accscalar_t, accscalar_t, EpilogueWithMul>
-                    <<<grid, block, smem_reduction_sz, stream>>>(output_ptr, input_ptr, dim_size);
-              } else {
-                cunn_SoftMaxForwardFast<ILP, scalar_t, accscalar_t, accscalar_t, EpilogueWithMul>
-                    <<<grid, block, smem_reduction_sz, stream>>>(output_ptr, input_ptr, dim_size);
-              }
+            dim3 block = SoftMaxForward_getBlockSize(dim_size);
+            size_t smem_reduction_sz = block.x / C10_WARP_SIZE * sizeof(accscalar_t);
+            auto max_elements_per_smem = (at::cuda::getCurrentDeviceProperties()->sharedMemPerBlock -
+              smem_reduction_sz) / sizeof(scalar_t);
+
+            bool can_use_smem = static_cast<size_t>(dim_size) < max_elements_per_smem;
+            can_use_smem &= !(reinterpret_cast<uintptr_t>(input_ptr) % ALIGN_BYTES);
+            can_use_smem &= (!(reinterpret_cast<uintptr_t>(output_ptr) % ALIGN_BYTES));
+            can_use_smem &= !(dim_size % ILP);
+
+            if (can_use_smem) {
+              size_t smem_sz = dim_size * sizeof(scalar_t) + smem_reduction_sz;
+              cunn_SoftMaxForwardSmem<ILP, scalar_t, accscalar_t, accscalar_t, Epilogue>
+                <<<grid, block, smem_sz, stream>>>(output_ptr, input_ptr, dim_size);
            } else {
-              dim3 block = SoftMaxForward_getBlockSize(dim_size);
-              size_t smem_reduction_sz = block.x / C10_WARP_SIZE * sizeof(accscalar_t);
-              auto max_elements_per_smem = (at::cuda::getCurrentDeviceProperties()->sharedMemPerBlock -
-                smem_reduction_sz) / sizeof(scalar_t);
-
-              bool can_use_smem = static_cast<size_t>(dim_size) < max_elements_per_smem;
-              can_use_smem &= !(reinterpret_cast<uintptr_t>(input_ptr) % ALIGN_BYTES);
-              can_use_smem &= (!(reinterpret_cast<uintptr_t>(output_ptr) % ALIGN_BYTES));
-              can_use_smem &= !(dim_size % ILP);
-
-              if (can_use_smem) {
-                size_t smem_sz = dim_size * sizeof(scalar_t) + smem_reduction_sz;
-                cunn_SoftMaxForwardSmem<ILP, scalar_t, accscalar_t, accscalar_t, Epilogue>
-                  <<<grid, block, smem_sz, stream>>>(output_ptr, input_ptr, dim_size);
-              } else {
-                cunn_SoftMaxForward<ILP, scalar_t, accscalar_t, accscalar_t, Epilogue>
-                  <<<grid, block, smem_reduction_sz, stream>>>(output_ptr, input_ptr, dim_size);
-              }
+              cunn_SoftMaxForward<ILP, scalar_t, accscalar_t, accscalar_t, Epilogue>
+                <<<grid, block, smem_reduction_sz, stream>>>(output_ptr, input_ptr, dim_size);
            }

            C10_CUDA_KERNEL_LAUNCH_CHECK();
@ -1412,7 +1252,7 @@ TORCH_IMPL_FUNC(log_softmax_cuda_out) (
  const int64_t dim,
  const bool half_to_float,
  const Tensor &output) {
-  host_softmax<LogSoftMaxForwardEpilogue, LogSoftMaxForwardEpilogue, true, false>(input, dim, half_to_float, output);
+  host_softmax<LogSoftMaxForwardEpilogue,true>(input, dim, half_to_float, output);
 }

 TORCH_IMPL_FUNC(log_softmax_backward_cuda_out) (
@ -1436,11 +1276,7 @@ TORCH_IMPL_FUNC(softmax_cuda_out) (
  const int64_t dim,
  const bool half_to_float,
  const Tensor &output) {
-#if defined(USE_ROCM)
-   host_softmax<SoftMaxForwardEpilogue, SoftMaxForwardWithMulEpilogue, false, true>(input, dim, half_to_float, output);
- #else
-   host_softmax<SoftMaxForwardEpilogue, SoftMaxForwardWithMulEpilogue, false, false>(input, dim, half_to_float, output);
- #endif
+  host_softmax<SoftMaxForwardEpilogue,false>(input, dim, half_to_float, output);
 }

 TORCH_IMPL_FUNC(softmax_backward_cuda_out)
--- a/aten/src/ATen/native/hip/ck_gemm_bfloat16.hip
+++ b/aten/src/ATen/native/hip/ck_gemm_bfloat16.hip
@ -469,315 +469,11 @@ void dispatch_bfloat16_gemm(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) {
  }
 }

-void dispatch_bfloat16_gemm_wmma(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) {
-  // If any of the shapes cant be tiled, we must use padding.
-  bool use_padding = ((m % 256 != 0) || (n % 128 != 0) || (k % 64 != 0));
-  // Dispatch to best implementation.
-  // TODO add more configurations. Optimize.
-
-  bool transa_ = std::tolower(transa) != 'n';
-  bool transb_ = std::tolower(transb) != 'n';
-
-  if (use_padding) {
-      if(transa_ && transb_) { // col , col
-          gemm_impl_wmma<
-            at::BFloat16,
-            256,
-            128,
-            256,
-            64,
-            8,
-            16,
-            16,
-            4,
-            4,
-            S<4, 64, 1>,
-            S<0, 2, 1>,
-            S<0, 2, 1>,
-            1,
-            1,
-            8,
-            true,
-            S<4, 64, 1>,
-            S<1, 0, 2>,
-            S<1, 0, 2>,
-            2,
-            8,
-            8,
-            true,
-            1,
-            1,
-            S<1, 32, 1,  8>,
-            8,
-            true,
-            true,
-            true>
-            (CUDABLAS_GEMM_ARGS(at::BFloat16));
-      }
-      else if(transa_ && !transb_) { // row, col
-          gemm_impl_wmma<
-            at::BFloat16,
-            256,
-            128,
-            256,
-            64,
-            8,
-            16,
-            16,
-            4,
-            4,
-            S<4, 64, 1>,
-            S<1, 0, 2>,
-            S<1, 0, 2>,
-            2,
-            8,
-            8,
-            true,
-            S<4, 64, 1>,
-            S<1, 0, 2>,
-            S<1, 0, 2>,
-            2,
-            8,
-            8,
-            true,
-            1,
-            1,
-            S<1, 32, 1,  8>,
-            8,
-            true,
-            true,
-            false>
-            (CUDABLAS_GEMM_ARGS(at::BFloat16));
-      }
-      else if(!transa_ && transb_) { //col, row
-          gemm_impl_wmma<
-            at::BFloat16,
-            256,
-            128,
-            256,
-            64,
-            8,
-            16,
-            16,
-            4,
-            4,
-            S<4, 64, 1>,
-            S<0, 2, 1>,
-            S<0, 2, 1>,
-            1,
-            1,
-            8,
-            true,
-            S<4, 64, 1>,
-            S<0, 2, 1>,
-            S<0, 2, 1>,
-            1,
-            1,
-            8,
-            true,
-            1,
-            1,
-            S<1, 32, 1,  8>,
-            8,
-            true,
-            false,
-            true>
-            (CUDABLAS_GEMM_ARGS(at::BFloat16));
-      }
-      else if(!transa_ && !transb_) { //row, row
-          gemm_impl_wmma<
-            at::BFloat16,
-            256,
-            128,
-            256,
-            64,
-            8,
-            16,
-            16,
-            4,
-            4,
-            S<4, 64, 1>,
-            S<1, 0, 2>,
-            S<1, 0, 2>,
-            2,
-            8,
-            8,
-            true,
-            S<4, 64, 1>,
-            S<0, 2, 1>,
-            S<0, 2, 1>,
-            1,
-            1,
-            8,
-            true,
-            1,
-            1,
-            S<1, 32, 1,  8>,
-            8,
-            true,
-            false,
-            false>
-            (CUDABLAS_GEMM_ARGS(at::BFloat16));
-      }
-      else {
-        TORCH_CHECK(false, "unreachable");
-      }
-  } else {
-         if(transa_ && transb_) { // col , col
-          gemm_impl_wmma<
-            at::BFloat16,
-            256,
-            128,
-            256,
-            64,
-            8,
-            16,
-            16,
-            4,
-            4,
-            S<4, 64, 1>,
-            S<0, 2, 1>,
-            S<0, 2, 1>,
-            1,
-            1,
-            8,
-            true,
-            S<4, 64, 1>,
-            S<1, 0, 2>,
-            S<1, 0, 2>,
-            2,
-            8,
-            8,
-            true,
-            1,
-            1,
-            S<1, 32, 1,  8>,
-            8,
-            false,
-            true,
-            true>
-            (CUDABLAS_GEMM_ARGS(at::BFloat16));
-      }
-      else if(transa_ && !transb_) { // row, col
-          gemm_impl_wmma<
-            at::BFloat16,
-            256,
-            128,
-            256,
-            64,
-            8,
-            16,
-            16,
-            4,
-            4,
-            S<4, 64, 1>,
-            S<1, 0, 2>,
-            S<1, 0, 2>,
-            2,
-            8,
-            8,
-            true,
-            S<4, 64, 1>,
-            S<1, 0, 2>,
-            S<1, 0, 2>,
-            2,
-            8,
-            8,
-            true,
-            1,
-            1,
-            S<1, 32, 1,  8>,
-            8,
-            false,
-            true,
-            false>
-            (CUDABLAS_GEMM_ARGS(at::BFloat16));
-      }
-      else if(!transa_ && transb_) { //col, row
-          gemm_impl_wmma<
-            at::BFloat16,
-            256,
-            128,
-            256,
-            64,
-            8,
-            16,
-            16,
-            4,
-            4,
-            S<4, 64, 1>,
-            S<0, 2, 1>,
-            S<0, 2, 1>,
-            1,
-            1,
-            8,
-            true,
-            S<4, 64, 1>,
-            S<0, 2, 1>,
-            S<0, 2, 1>,
-            1,
-            1,
-            8,
-            true,
-            1,
-            1,
-            S<1, 32, 1,  8>,
-            8,
-            false,
-            false,
-            true>
-            (CUDABLAS_GEMM_ARGS(at::BFloat16));
-      }
-      else if(!transa_ && !transb_) { //row, row
-          gemm_impl_wmma<
-            at::BFloat16,
-            256,
-            128,
-            256,
-            64,
-            8,
-            16,
-            16,
-            4,
-            4,
-            S<4, 64, 1>,
-            S<1, 0, 2>,
-            S<1, 0, 2>,
-            2,
-            8,
-            8,
-            true,
-            S<4, 64, 1>,
-            S<0, 2, 1>,
-            S<0, 2, 1>,
-            1,
-            1,
-            8,
-            true,
-            1,
-            1,
-            S<1, 32, 1,  8>, 8,
-            false,
-            false,
-            false>
-            (CUDABLAS_GEMM_ARGS(at::BFloat16));
-      }
-      else {
-        TORCH_CHECK(false, "unreachable");
-      }
-  }
-}


 template <>
 void gemm_internal_ck<at::BFloat16>(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) {
-  auto dprops = at::cuda::getCurrentDeviceProperties();
-  c10::string_view arch(dprops->gcnArchName);
-  if (arch == "gfx1100") {
-    dispatch_bfloat16_gemm_wmma(CUDABLAS_GEMM_ARGS(at::BFloat16));
-  } else{
-    dispatch_bfloat16_gemm(CUDABLAS_GEMM_ARGS(at::BFloat16));
-  }
+  dispatch_bfloat16_gemm(CUDABLAS_GEMM_ARGS(at::BFloat16));
 }

 } // namespace at::native
--- a/aten/src/ATen/native/hip/ck_gemm_half.hip
+++ b/aten/src/ATen/native/hip/ck_gemm_half.hip
@ -297,314 +297,10 @@ void dispatch_half_gemm(CUDABLAS_GEMM_ARGTYPES(at::Half)) {
  }
 #endif
 }
-void dispatch_half_gemm_wmma(CUDABLAS_GEMM_ARGTYPES(at::Half)) {
-  // If any of the shapes cant be tiled, we must use padding.
-  bool use_padding = ((m % 256 != 0) || (n % 128 != 0) || (k % 64 != 0));
-  // Dispatch to best implementation.
-  // TODO add more configurations. Optimize.
-
-  bool transa_ = std::tolower(transa) != 'n';
-  bool transb_ = std::tolower(transb) != 'n';
-
-  if (use_padding) {
-      if(transa_ && transb_) { // col , col
-          gemm_impl_wmma<
-            at::Half,
-            256,
-            128,
-            256,
-            64,
-            8,
-            16,
-            16,
-            4,
-            4,
-            S<4, 64, 1>,
-            S<0, 2, 1>,
-            S<0, 2, 1>,
-            1,
-            1,
-            8,
-            true,
-            S<4, 64, 1>,
-            S<1, 0, 2>,
-            S<1, 0, 2>,
-            2,
-            8,
-            8,
-            true,
-            1,
-            1,
-            S<1, 32, 1,  8>,
-            8,
-            true,
-            true,
-            true>
-            (CUDABLAS_GEMM_ARGS(at::Half));
-      }
-      else if(transa_ && !transb_) { // row, col
-          gemm_impl_wmma<
-            at::Half,
-            256,
-            128,
-            256,
-            64,
-            8,
-            16,
-            16,
-            4,
-            4,
-            S<4, 64, 1>,
-            S<1, 0, 2>,
-            S<1, 0, 2>,
-            2,
-            8,
-            8,
-            true,
-            S<4, 64, 1>,
-            S<1, 0, 2>,
-            S<1, 0, 2>,
-            2,
-            8,
-            8,
-            true,
-            1,
-            1,
-            S<1, 32, 1,  8>,
-            8,
-            true,
-            true,
-            false>
-            (CUDABLAS_GEMM_ARGS(at::Half));
-      }
-      else if(!transa_ && transb_) { //col, row
-          gemm_impl_wmma<
-            at::Half,
-            256,
-            128,
-            256,
-            64,
-            8,
-            16,
-            16,
-            4,
-            4,
-            S<4, 64, 1>,
-            S<0, 2, 1>,
-            S<0, 2, 1>,
-            1,
-            1,
-            8,
-            true,
-            S<4, 64, 1>,
-            S<0, 2, 1>,
-            S<0, 2, 1>,
-            1,
-            1,
-            8,
-            true,
-            1,
-            1,
-            S<1, 32, 1,  8>,
-            8,
-            true,
-            false,
-            true>
-            (CUDABLAS_GEMM_ARGS(at::Half));
-      }
-      else if(!transa_ && !transb_) { //row, row
-          gemm_impl_wmma<
-            at::Half,
-            256,
-            128,
-            256,
-            64,
-            8,
-            16,
-            16,
-            4,
-            4,
-            S<4, 64, 1>,
-            S<1, 0, 2>,
-            S<1, 0, 2>,
-            2,
-            8,
-            8,
-            true,
-            S<4, 64, 1>,
-            S<0, 2, 1>,
-            S<0, 2, 1>,
-            1,
-            1,
-            8,
-            true,
-            1,
-            1,
-            S<1, 32, 1,  8>,
-            8,
-            true,
-            false,
-            false>
-            (CUDABLAS_GEMM_ARGS(at::Half));
-      }
-      else {
-        TORCH_CHECK(false, "unreachable");
-      }
-  } else {
-         if(transa_ && transb_) { // col , col
-          gemm_impl_wmma<
-            at::Half,
-            256,
-            128,
-            256,
-            64,
-            8,
-            16,
-            16,
-            4,
-            4,
-            S<4, 64, 1>,
-            S<0, 2, 1>,
-            S<0, 2, 1>,
-            1,
-            1,
-            8,
-            true,
-            S<4, 64, 1>,
-            S<1, 0, 2>,
-            S<1, 0, 2>,
-            2,
-            8,
-            8,
-            true,
-            1,
-            1,
-            S<1, 32, 1,  8>,
-            8,
-            false,
-            true,
-            true>
-            (CUDABLAS_GEMM_ARGS(at::Half));
-      }
-      else if(transa_ && !transb_) { // row, col
-          gemm_impl_wmma<
-            at::Half,
-            256,
-            128,
-            256,
-            64,
-            8,
-            16,
-            16,
-            4,
-            4,
-            S<4, 64, 1>,
-            S<1, 0, 2>,
-            S<1, 0, 2>,
-            2,
-            8,
-            8,
-            true,
-            S<4, 64, 1>,
-            S<1, 0, 2>,
-            S<1, 0, 2>,
-            2,
-            8,
-            8,
-            true,
-            1,
-            1,
-            S<1, 32, 1,  8>,
-            8,
-            false,
-            true,
-            false>
-            (CUDABLAS_GEMM_ARGS(at::Half));
-      }
-      else if(!transa_ && transb_) { //col, row
-          gemm_impl_wmma<
-            at::Half,
-            256,
-            128,
-            256,
-            64,
-            8,
-            16,
-            16,
-            4,
-            4,
-            S<4, 64, 1>,
-            S<0, 2, 1>,
-            S<0, 2, 1>,
-            1,
-            1,
-            8,
-            true,
-            S<4, 64, 1>,
-            S<0, 2, 1>,
-            S<0, 2, 1>,
-            1,
-            1,
-            8,
-            true,
-            1,
-            1,
-            S<1, 32, 1,  8>,
-            8,
-            false,
-            false,
-            true>
-            (CUDABLAS_GEMM_ARGS(at::Half));
-      }
-      else if(!transa_ && !transb_) { //row, row
-          gemm_impl_wmma<
-            at::Half,
-            256,
-            128,
-            256,
-            64,
-            8,
-            16,
-            16,
-            4,
-            4,
-            S<4, 64, 1>,
-            S<1, 0, 2>,
-            S<1, 0, 2>,
-            2,
-            8,
-            8,
-            true,
-            S<4, 64, 1>,
-            S<0, 2, 1>,
-            S<0, 2, 1>,
-            1,
-            1,
-            8,
-            true,
-            1,
-            1,
-            S<1, 32, 1,  8>, 8,
-            false,
-            false,
-            false>
-            (CUDABLAS_GEMM_ARGS(at::Half));
-      }
-      else {
-        TORCH_CHECK(false, "unreachable");
-      }
-  }
-}

 template <>
 void gemm_internal_ck<at::Half>(CUDABLAS_GEMM_ARGTYPES(at::Half)) {
-  auto dprops = at::cuda::getCurrentDeviceProperties();
-  c10::string_view arch(dprops->gcnArchName);
-  if (arch == "gfx1100") {
-    dispatch_half_gemm_wmma(CUDABLAS_GEMM_ARGS(at::Half));
-  } else{
-    dispatch_half_gemm(CUDABLAS_GEMM_ARGS(at::Half));
-  }
+  dispatch_half_gemm(CUDABLAS_GEMM_ARGS(at::Half));
 }

 } // namespace at::native
--- a/aten/src/ATen/native/hip/ck_gemm_template.h
+++ b/aten/src/ATen/native/hip/ck_gemm_template.h
@ -30,7 +30,6 @@
 #include <ck/library/utility/literals.hpp>

 #include <ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle_v3.hpp>
-#include <ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp>

 // Define commonly used types.
 template <ck::index_t... Is>
@ -237,180 +236,4 @@ void gemm_impl(CUDABLAS_GEMM_ARGTYPES(Dtype)) {
 invoker.Run(argument, StreamConfig{stream, false});
 }

-
-template <
-    typename Dtype,
-    int BLOCK_SIZE,
-    int MBLOCK,
-    int NBLOCK,
-    int KBLOCK,
-    int K1,
-    int MPER_WMMA,
-    int NPER_WMMA,
-    int MPER_WAVE,
-    int NPER_WAVE,
-    typename ABLOCK_CLUSTER_LENS,
-    typename ABLOCK_CLUSTER_ORDER,
-    typename ABLOCK_SRC_ORDER,
-    int ABLOCK_VECTOR_DIM,
-    int ABLOCK_SCALAR_VEC,
-    int ABLOCK_SCALAR_VEC_K1,
-    bool ABLOCK_LDS_EXTRAM,
-    typename BBLOCK_CLUSTER_LENS,
-    typename BBLOCK_CLUSTER_ORDER,
-    typename BBLOCK_SRC_ORDER,
-    int BBLOCK_VECTOR_DIM,
-    int BBLOCK_SCALAR_VEC,
-    int BBLOCK_SCALAR_VEC_AK1,
-    bool BBLOCK_LDS_EXTRAN,
-    int CMPER_WAVE,
-    int CNPER_WAVE,
-    typename CBLOCK_CLUSTER_LENS,
-    int CNPER_BLOCK,
-    bool PADDING = false,
-    bool TRANSA = false,
-    bool TRANSB = false>
-void gemm_impl_wmma(CUDABLAS_GEMM_ARGTYPES(Dtype)) {
-  // Get input information.
-  int M = m;
-  int N = n;
-  int K = k;
-
-  int StrideA = lda;
-  int StrideB = ldb;
-  int StrideC = ldc;
-
-  int KBatch = 1;
-
-  float falpha = alpha;
-  float fbeta = beta;
-
-  using ADataType = typename CkMathType<Dtype>::dtype;
-  using BDataType = typename CkMathType<Dtype>::dtype;
-  using CDataType = typename CkMathType<Dtype>::dtype;
-  using DDataType = typename CkMathType<Dtype>::dtype;
-
-  using AccDataType = float;
-  using CShuffleDataType = typename CkMathType<Dtype>::dtype;
-
-  using ALayout = typename CkTensorLayout<TRANSA, TRANSB>::a_layout;
-  using BLayout = typename CkTensorLayout<TRANSA, TRANSB>::b_layout;
-
-  using DLayout = Row;
-  using CLayout = Row;
-
-  using AElementOp = PassThrough;
-  using BElementOp = PassThrough;
-  using CElementOp = PassThrough;
-
-
-  static constexpr auto GemmDefault =
-      ck::tensor_operation::device::GemmSpecialization::Default;
-  static constexpr auto GemmMNKPadding =
-      ck::tensor_operation::device::GemmSpecialization::MNKPadding;
-  static constexpr auto GemmSpec = PADDING ? GemmMNKPadding : GemmDefault;
-
-
-  using DeviceGemmInstance =
-            ck::tensor_operation::device::DeviceGemmWmma_CShuffle<ALayout,
-                                                                  BLayout,
-                                                                  CLayout,
-                                                                  ADataType,
-                                                                  BDataType,
-                                                                  CDataType,
-                                                                  AccDataType,
-                                                                  CShuffleDataType,
-                                                                  AElementOp,
-                                                                  BElementOp,
-                                                                  CElementOp,
-                                                                  GemmSpec,
-                                                                  1,   // NumPrefetch
-                                                                  BLOCK_SIZE,
-                                                                  MBLOCK,
-                                                                  NBLOCK,
-                                                                  KBLOCK,
-                                                                  K1,
-                                                                  MPER_WMMA,
-                                                                  NPER_WMMA,
-                                                                  MPER_WAVE,
-                                                                  NPER_WAVE,
-                                                                  ABLOCK_CLUSTER_LENS,
-                                                                  ABLOCK_CLUSTER_ORDER,
-                                                                  ABLOCK_SRC_ORDER,
-                                                                  ABLOCK_VECTOR_DIM,
-                                                                  ABLOCK_SCALAR_VEC,
-                                                                  ABLOCK_SCALAR_VEC_K1,
-                                                                  ABLOCK_LDS_EXTRAM,
-                                                                  BBLOCK_CLUSTER_LENS,
-                                                                  BBLOCK_CLUSTER_ORDER,
-                                                                  BBLOCK_SRC_ORDER,
-                                                                  BBLOCK_VECTOR_DIM,
-                                                                  BBLOCK_SCALAR_VEC,
-                                                                  BBLOCK_SCALAR_VEC_AK1,
-                                                                  BBLOCK_LDS_EXTRAN,
-                                                                  CMPER_WAVE,
-                                                                  CNPER_WAVE,
-                                                                  CBLOCK_CLUSTER_LENS,
-                                                                  CNPER_BLOCK>;
-
-  auto gemm = DeviceGemmInstance{};
-  auto invoker = gemm.MakeInvoker();
-
-  auto a_element_op = AElementOp{};
-  auto b_element_op = BElementOp{};
-  auto c_element_op = CElementOp{};
-
-
-  using DDataArrayType = std::array<const void*, 0>;
-  DDataArrayType DDataArray;
-
-  // We swap A and B inputs here as a temporary workaround
-  auto argument = gemm.MakeArgument(
-     reinterpret_cast<const ADataType*>(b),
-     reinterpret_cast<const BDataType*>(a),
-     reinterpret_cast<CDataType*>(c),
-     N,
-     M,
-     K,
-     StrideB,
-     StrideA,
-     StrideC,
-     b_element_op,
-     a_element_op,
-     c_element_op);
-
-
- if(!gemm.IsSupportedArgument(argument))
- {
-        printf("error shape = %d %d %d TRANSA=%d TRANSB=%d \n",
-                        n, m, k,TRANSA, TRANSB);
-
-        throw std::runtime_error(
-            "wrong! device_gemm with the specified compilation parameters does "
-            "not support this GEMM problem");
- }
-
-
- auto stream = at::cuda::getCurrentHIPStream().stream();
-#if 1
- invoker.Run(argument, StreamConfig{stream, false});
-#else
-  float ave_time = invoker.Run(argument, StreamConfig{stream, true});
-  std::size_t flop = std::size_t(2) * M * N * K;
-
-  std::size_t num_btype =
-              sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(CDataType) * M * N;
-
-  float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
-
-  float gb_per_sec = num_btype / 1.E6 / ave_time;
-
-  std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
-                          << gb_per_sec << " GB/s, " << N <<" " <<M<<" " << k <<" "
-                          << "stride: "<<StrideA <<" "<<StrideB <<" "<<StrideC <<" "
-                          <<  gemm.GetTypeString()
-                          << std::endl;
-#endif
-}
-
 } // namespace at::native
--- a/aten/src/ATen/native/mkldnn/xpu/detail/Attention.cpp
+++ b/aten/src/ATen/native/mkldnn/xpu/detail/Attention.cpp
@ -311,8 +311,9 @@ void gpu_float_sdpa(
    bool is_causal,
    float softmax_scale,
    const Tensor& output) {
-  auto& eng = GpuEngineManager::Instance().get_engine();
-  auto& strm = GpuStreamManager::Instance().get_stream();
+  auto eng = GpuEngineManager::Instance().get_engine(
+      {c10::kXPU, c10::xpu::current_device()});
+  auto strm = GpuStreamManager::Instance().get_stream();

  const auto get_tril_mask = [&]() {
    auto opts = query.options();
--- a/aten/src/ATen/native/mkldnn/xpu/detail/Attr.h
+++ b/aten/src/ATen/native/mkldnn/xpu/detail/Attr.h
@ -338,7 +338,8 @@ class Attr {
    // [1, C, 1, 1], channel broadcast
    // [dst.shape], no broadcast and eltwise-wise binary operations on dst

-    auto& engine = GpuEngineManager::Instance().get_engine();
+    auto engine = GpuEngineManager::Instance().get_engine(
+        {c10::kXPU, c10::xpu::current_device()});
    for (size_t i = 0; i < ops_params_.size(); ++i) {
      kind_t kind = ops_params_[i].kind_;
      if (kind == kind_t::binary) {
--- a/aten/src/ATen/native/mkldnn/xpu/detail/Conv.cpp
+++ b/aten/src/ATen/native/mkldnn/xpu/detail/Conv.cpp
@ -83,8 +83,9 @@ sycl::event convolution(
    int64_t groups,
    Attr& attr,
    const std::vector<sycl::event>& deps) {
-  auto& engine = GpuEngineManager::Instance().get_engine();
-  auto& stream = GpuStreamManager::Instance().get_stream();
+  auto engine = GpuEngineManager::Instance().get_engine(
+      {c10::kXPU, c10::xpu::current_device()});
+  auto stream = GpuStreamManager::Instance().get_stream();

  bool is_channels_last = use_channels_last_for_conv(src, weight);

@ -183,8 +184,9 @@ sycl::event convolution_backward_weights(
    IntArrayRef dilation,
    int64_t groups,
    const std::vector<sycl::event>& deps) {
-  auto& engine = GpuEngineManager::Instance().get_engine();
-  auto& stream = GpuStreamManager::Instance().get_stream();
+  auto engine = GpuEngineManager::Instance().get_engine(
+      {c10::kXPU, c10::xpu::current_device()});
+  auto stream = GpuStreamManager::Instance().get_stream();

  bool is_channels_last = use_channels_last_for_conv(src, diff_dst);

@ -290,8 +292,9 @@ sycl::event convolution_backward_data(
    int64_t groups,
    bool bias_defined,
    const std::vector<sycl::event>& deps) {
-  auto& engine = GpuEngineManager::Instance().get_engine();
-  auto& stream = GpuStreamManager::Instance().get_stream();
+  auto engine = GpuEngineManager::Instance().get_engine(
+      {c10::kXPU, c10::xpu::current_device()});
+  auto stream = GpuStreamManager::Instance().get_stream();

  bool is_channels_last = use_channels_last_for_conv(diff_dst, weight);

--- a/aten/src/ATen/native/mkldnn/xpu/detail/Deconv.cpp
+++ b/aten/src/ATen/native/mkldnn/xpu/detail/Deconv.cpp
@ -158,8 +158,9 @@ sycl::event deconvolution(
    int64_t groups,
    Attr& attr,
    const std::vector<sycl::event>& deps) {
-  auto& engine = GpuEngineManager::Instance().get_engine();
-  auto& stream = GpuStreamManager::Instance().get_stream();
+  auto engine = GpuEngineManager::Instance().get_engine(
+      {c10::kXPU, c10::xpu::current_device()});
+  auto stream = GpuStreamManager::Instance().get_stream();

  bool is_channels_last_suggested = use_channels_last_for_conv(src, weight);

@ -248,8 +249,9 @@ sycl::event deconvolution_backward_data(
    int64_t groups,
    bool bias_defined,
    const std::vector<sycl::event>& deps) {
-  auto& engine = GpuEngineManager::Instance().get_engine();
-  auto& stream = GpuStreamManager::Instance().get_stream();
+  auto engine = GpuEngineManager::Instance().get_engine(
+      {c10::kXPU, c10::xpu::current_device()});
+  auto stream = GpuStreamManager::Instance().get_stream();

  bool is_channels_last_suggested =
      use_channels_last_for_conv(diff_dst, weight);
@ -345,8 +347,9 @@ sycl::event deconvolution_backward_weights(
    IntArrayRef dilation,
    int64_t groups,
    const std::vector<sycl::event>& deps) {
-  auto& engine = GpuEngineManager::Instance().get_engine();
-  auto& stream = GpuStreamManager::Instance().get_stream();
+  auto engine = GpuEngineManager::Instance().get_engine(
+      {c10::kXPU, c10::xpu::current_device()});
+  auto stream = GpuStreamManager::Instance().get_stream();

  bool is_channels_last_suggested = use_channels_last_for_conv(src, diff_dst);

--- a/aten/src/ATen/native/mkldnn/xpu/detail/Matmul.cpp
+++ b/aten/src/ATen/native/mkldnn/xpu/detail/Matmul.cpp
@ -30,8 +30,9 @@ sycl::event matmul(
      "oneDNN input matrixes must have the same ranks");
  TORCH_CHECK(result.defined(), "oneDNN matmul result should be defined");

-  auto& engine = GpuEngineManager::Instance().get_engine();
-  auto& stream = GpuStreamManager::Instance().get_stream();
+  at::Device cur_device = at::Device(at::kXPU, c10::xpu::current_device());
+  auto engine = GpuEngineManager::Instance().get_engine(cur_device);
+  auto stream = GpuStreamManager::Instance().get_stream();

  at::Tensor m1 = mat1;
  at::Tensor m2 = mat2;
--- a/aten/src/ATen/native/mkldnn/xpu/detail/QConv.cpp
+++ b/aten/src/ATen/native/mkldnn/xpu/detail/QConv.cpp
@ -107,8 +107,9 @@ at::Tensor quantized_convolution(
      output.defined(),
      "A valid output is required for quantized convolution.");

-  auto& engine = GpuEngineManager::Instance().get_engine();
-  auto& stream = GpuStreamManager::Instance().get_stream();
+  auto engine = GpuEngineManager::Instance().get_engine(
+      {c10::kXPU, c10::xpu::current_device()});
+  auto stream = GpuStreamManager::Instance().get_stream();

  // input tensors config
  dnnl::memory::dims src_dims = act.sizes().vec();
--- a/aten/src/ATen/native/mkldnn/xpu/detail/QMatmul.cpp
+++ b/aten/src/ATen/native/mkldnn/xpu/detail/QMatmul.cpp
@ -125,8 +125,9 @@ void quantized_matmul(
      attr);

  size_t dims = result.dim();
-  auto& engine = GpuEngineManager::Instance().get_engine();
-  auto& stream = GpuStreamManager::Instance().get_stream();
+  at::Device cur_device = at::Device(at::kXPU, c10::xpu::current_device());
+  auto engine = GpuEngineManager::Instance().get_engine(cur_device);
+  auto stream = GpuStreamManager::Instance().get_stream();

  at::Tensor m1 = is_onednn_matmul_strides(mat1) ? mat1 : mat1.contiguous();
  at::Tensor m2 = is_onednn_matmul_strides(mat2) ? mat2 : mat2.contiguous();
--- a/aten/src/ATen/native/mkldnn/xpu/detail/oneDNNContext.cpp
+++ b/aten/src/ATen/native/mkldnn/xpu/detail/oneDNNContext.cpp
@ -29,7 +29,8 @@ static inline void dnnl_delete(
 }

 GpuEngineManager::GpuEngineManager() {
-  c10::DeviceIndex device_count = c10::xpu::device_count_ensure_non_zero();
+  c10::DeviceIndex device_count = c10::xpu::device_count();
+  TORCH_INTERNAL_ASSERT(device_count > 0);
  for (const auto i : c10::irange(device_count)) {
    static dnnl::graph::allocator alloc =
        dnnl::graph::sycl_interop::make_allocator(dnnl_alloc, dnnl_delete);
--- a/aten/src/ATen/native/mkldnn/xpu/detail/oneDNNContext.h
+++ b/aten/src/ATen/native/mkldnn/xpu/detail/oneDNNContext.h
@ -25,15 +25,10 @@ bool set_onednn_verbose(int level);
 struct TORCH_XPU_API GpuEngineManager {
  static GpuEngineManager& Instance(); // Singleton

-  dnnl::engine& get_engine(
-      DeviceIndex device_index = c10::xpu::current_device()) {
-    c10::xpu::check_device_index(device_index);
-    return *engine_pool[device_index];
-  }
-
  dnnl::engine& get_engine(const Device& device) {
    TORCH_INTERNAL_ASSERT(device.type() == kXPU);
-    return get_engine(device.index());
+    TORCH_INTERNAL_ASSERT(device.index() < c10::xpu::device_count());
+    return *engine_pool[device.index()];
  }

  GpuEngineManager(GpuEngineManager const&) = delete;
@ -53,15 +48,16 @@ struct TORCH_XPU_API GpuEngineManager {
 struct TORCH_XPU_API GpuStreamManager {
  static GpuStreamManager& Instance(); // Singleton

-  dnnl::stream& get_stream(
-      DeviceIndex device_index = c10::xpu::current_device()) {
-    auto stream = c10::xpu::getCurrentXPUStream(device_index);
+  dnnl::stream get_stream() {
+    auto stream = c10::xpu::getCurrentXPUStream();
    auto priority = stream.priority();
+    auto device_index = stream.device_index();
    if (stream_pool[device_index][priority].find(stream) ==
        stream_pool[device_index][priority].end()) {
      stream_pool[device_index][priority][stream] =
          std::make_shared<dnnl::stream>(dnnl::sycl_interop::make_stream(
-              GpuEngineManager::Instance().get_engine(device_index),
+              GpuEngineManager::Instance().get_engine(
+                  {c10::kXPU, device_index}),
              stream.queue()));
    }
    return *stream_pool[device_index][priority][stream];
@ -74,7 +70,8 @@ struct TORCH_XPU_API GpuStreamManager {

 protected:
  GpuStreamManager() {
-    c10::DeviceIndex device_count = c10::xpu::device_count_ensure_non_zero();
+    c10::DeviceIndex device_count = c10::xpu::device_count();
+    TORCH_INTERNAL_ASSERT(device_count > 0);
    stream_pool.resize(device_count);
  }
  ~GpuStreamManager() = default;
--- a/aten/src/ATen/native/mps/MetalShaderLibrary.h
+++ b/aten/src/ATen/native/mps/MetalShaderLibrary.h
@ -133,10 +133,6 @@ class MetalShaderLibrary {
      TensorIteratorBase& iter,
      const std::string& name,
      std::optional<int64_t> extra = std::nullopt);
-  void exec_binary_kernel(
-      TensorIteratorBase& iter,
-      const std::string& name,
-      const bool supports_dense = true);

 protected:
  virtual MTLLibrary_t getLibrary();
--- a/aten/src/ATen/native/mps/OperationUtils.mm
+++ b/aten/src/ATen/native/mps/OperationUtils.mm
@ -1010,49 +1010,6 @@ void MetalShaderLibrary::exec_unary_kernel(TensorIteratorBase& iter,
  }
 }

-void MetalShaderLibrary::exec_binary_kernel(TensorIteratorBase& iter,
-                                            const std::string& name,
-                                            const bool supports_dense) {
-  TORCH_CHECK(iter.common_dtype() != at::kDouble, "float64 is not supported on MPS");
-
-  Tensor input = iter.input(0);
-  Tensor other = iter.input(1);
-  Tensor out = iter.output();
-
-  id<MTLDevice> device = MPSDevice::getInstance()->device();
-  MPSStream* mpsStream = getCurrentMPSStream();
-  const uint32_t nDim = iter.ndim();
-  constexpr uint32_t nOffsets = 3;
-  const uint32_t numThreads = iter.numel();
-  dispatch_sync_with_rethrow(mpsStream->queue(), ^() {
-    @autoreleasepool {
-      auto computeEncoder = mpsStream->commandEncoder();
-      if (supports_dense && iter.is_contiguous()) {
-        const auto kernel_name = fmt::format("{}_dense_{}", name, scalarToMetalTypeString(input));
-        auto binaryPSO = getPipelineStateForFunc(kernel_name);
-        [computeEncoder setComputePipelineState:binaryPSO];
-        mtl_setArgs(computeEncoder, input, other, out);
-        mtl_dispatch1DJob(computeEncoder, binaryPSO, numThreads);
-        return;
-      }
-      const auto kernel = fmt::format("{}_{}", name, scalarToMetalTypeString(input));
-      auto kernelDataOffsets = generateKernelDataOffsets(computeEncoder, iter);
-
-      auto binaryPSO = getPipelineStateForFunc(kernel);
-
-      // this function call is a no-op if MPS Profiler is not enabled
-      getMPSProfiler().beginProfileKernel(binaryPSO, kernel, {input, other});
-
-      [computeEncoder setComputePipelineState:binaryPSO];
-      mtl_setArgs(computeEncoder, input, other, out);
-      [computeEncoder setBuffer:kernelDataOffsets offset:0 atIndex:3];
-      mtl_dispatch1DJob(computeEncoder, binaryPSO, numThreads);
-
-      getMPSProfiler().endProfileKernel(binaryPSO);
-    }
-  });
-}
-
 MetalShaderLibrary& MetalShaderLibrary::getBundledLibrary() {
  static BundledShaderLibary l;
  return l;
--- a/aten/src/ATen/native/mps/kernels/BinaryKernel.metal
+++ b/aten/src/ATen/native/mps/kernels/BinaryKernel.metal
@ -1,4 +1,3 @@
-#include <c10/metal/indexing.h>
 #include <c10/metal/special_math.h>
 #include <c10/metal/utils.h>
 #include <metal_stdlib>
@ -92,6 +91,59 @@ struct polar_functor {
  }
 };

+// Future BinaryTensorIterator
+// result_of<T, F> names the type produced by calling a functor of type F with
+// two arguments of type T. The binary kernels in this file use it as the
+// element type of their output buffer, so the output type follows whatever
+// the functor returns.
+template <typename T, typename F>
+using result_of = decltype(::metal::declval<F>()(
+    ::metal::declval<T>(),
+    ::metal::declval<T>()));
+
+// Offset-driven binary kernel. Thread `tid` applies functor F to one element
+// of each input and stores the result in the output. The three elements are
+// located through byte offsets taken from offsets[tid]:
+//   .x -> output, .y -> first input, .z -> second input.
+template <typename T, typename F>
+kernel void binary_indexing(
+    constant void* input_ [[buffer(0)]],
+    constant void* other_ [[buffer(1)]],
+    device void* out_ [[buffer(2)]],
+    constant uint3* offsets [[buffer(3)]],
+    uint tid [[thread_position_in_grid]]) {
+  // Fetch this thread's offset triple once.
+  const uint3 byte_off = offsets[tid];
+  // Advance in bytes first, then reinterpret as the element type.
+  auto lhs = (constant T*)((constant uint8_t*)input_ + byte_off.y);
+  auto rhs = (constant T*)((constant uint8_t*)other_ + byte_off.z);
+  auto dst = (device result_of<T, F>*)((device uint8_t*)out_ + byte_off.x);
+  F op;
+  *dst = op(*lhs, *rhs);
+}
+
+// Index-driven binary kernel. Thread `tid` reads element `tid` of both inputs,
+// applies functor F, and writes element `tid` of the output; no offset buffer
+// is involved.
+template <typename T, typename F>
+kernel void binary_dense(
+    constant T* input [[buffer(0)]],
+    constant T* other [[buffer(1)]],
+    device result_of<T, F>* out [[buffer(2)]],
+    uint tid [[thread_position_in_grid]]) {
+  const T lhs = input[tid];
+  const T rhs = other[tid];
+  F op;
+  out[tid] = op(lhs, rhs);
+}
+
+// Explicitly instantiates both functor-based binary kernels for the functor
+// NAME##_functor and element type DTYPE, and exports them under these
+// host-visible names:
+//   "<NAME>_<DTYPE>"        -> binary_indexing<DTYPE, NAME##_functor>
+//   "<NAME>_dense_<DTYPE>"  -> binary_dense<DTYPE, NAME##_functor>
+// The macro body has no trailing semicolon; the use site supplies it.
+#define REGISTER_BINARY_INDEXING_OP(NAME, DTYPE)             \
+  template [[host_name(#NAME "_" #DTYPE)]] kernel void       \
+  binary_indexing<DTYPE, NAME##_functor>(                    \
+      constant void* input_,                                 \
+      constant void* other_,                                 \
+      device void* out_,                                     \
+      constant uint3* offsets,                               \
+      uint tid);                                             \
+  template [[host_name(#NAME "_dense_" #DTYPE)]] kernel void \
+  binary_dense<DTYPE, NAME##_functor>(                       \
+      constant DTYPE * input_,                               \
+      constant DTYPE * other_,                               \
+      device result_of<DTYPE, NAME##_functor> * out_,        \
+      uint tid)
+
+// Explicitly instantiates the kernel template NAME for element type DTYPE and
+// exports it under the host-visible name "<NAME>_<DTYPE>". It is used in this
+// file for kernels that are written out in full (complex_mul, complex_kernel)
+// rather than generated from a functor. There is no "_dense_" variant here.
+#define REGISTER_BINARY_OP(NAME, DTYPE)                             \
+  template [[host_name(#NAME "_" #DTYPE)]] kernel void NAME<DTYPE>( \
+      constant void* input_,                                        \
+      constant void* other_,                                        \
+      device void* out_,                                            \
+      constant uint3* offsets,                                      \
+      uint tid)
+
 REGISTER_BINARY_INDEXING_OP(copysign, long);
 REGISTER_BINARY_INDEXING_OP(copysign, int);
 REGISTER_BINARY_INDEXING_OP(copysign, float);
@ -138,7 +190,9 @@ kernel void complex_mul(
  out[1] = input[0] * other[1] + input[1] * other[0];
 }

-// Constructs complex tensor from real and imaginary planes
+REGISTER_BINARY_OP(complex_mul, float);
+REGISTER_BINARY_OP(complex_mul, half);
+
 template <typename T>
 kernel void complex_kernel(
    constant void* real_ [[buffer(0)]],
@ -153,15 +207,5 @@ kernel void complex_kernel(
  out[1] = imag[0];
 }

-#define REGISTER_BINARY_OP(NAME, DTYPE)                             \
-  template [[host_name(#NAME "_" #DTYPE)]] kernel void NAME<DTYPE>( \
-      constant void* input_,                                        \
-      constant void* other_,                                        \
-      device void* out_,                                            \
-      constant uint3* offsets,                                      \
-      uint tid)
-
-REGISTER_BINARY_OP(complex_mul, float);
-REGISTER_BINARY_OP(complex_mul, half);
 REGISTER_BINARY_OP(complex_kernel, float);
 REGISTER_BINARY_OP(complex_kernel, half);
--- a/aten/src/ATen/native/mps/kernels/SpecialOps.metal
+++ b/aten/src/ATen/native/mps/kernels/SpecialOps.metal
@ -1,63 +1,16 @@
 #include <c10/metal/indexing.h>
 #include <c10/metal/special_math.h>
 using namespace c10::metal;
-using namespace metal;

-DEFINE_UNARY_FLOATING_FUNCTOR(bessel_j0_forward);
-DEFINE_UNARY_FLOATING_FUNCTOR(bessel_j1_forward);
-DEFINE_UNARY_FLOATING_FUNCTOR(modified_bessel_i0_forward);
-DEFINE_UNARY_FLOATING_FUNCTOR(modified_bessel_i1_forward);
 DEFINE_UNARY_FLOATING_FUNCTOR(i0);
-DEFINE_UNARY_FLOATING_FUNCTOR(i0e);
 DEFINE_UNARY_FLOATING_FUNCTOR(i1);
-DEFINE_UNARY_FLOATING_FUNCTOR(i1e);
 DEFINE_UNARY_FLOATING_FUNCTOR(spherical_bessel_j0);
 DEFINE_UNARY_FLOATING_FUNCTOR(entr);

-// TODO: Replaceme with DEFINE_UNARY_FLOATING_FUNCTOR
-// But for some reason instantinating bessel_y[01] on M1/M2 results in
-// Failed to created pipeline state object, error: Error Domain=AGXMetalG14X
-// Code=3 "Compiler encountered an internal error"
-struct bessel_y0_forward_functor {
-  template <typename T>
-  inline enable_if_t<is_floating_point_v<T>, T> operator()(const T x) {
-    return static_cast<T>(bessel_y0_forward(x));
-  }
-  template <typename T>
-  inline enable_if_t<is_integral_v<T>, float> operator()(const T x) {
-    return bessel_y0_forward(static_cast<float>(x));
-  }
-  inline float operator()(const bool x) {
-    return x ? 0.08825694769620895 : -INFINITY;
-  }
-};
-
-struct bessel_y1_forward_functor {
-  template <typename T>
-  inline enable_if_t<is_floating_point_v<T>, T> operator()(const T x) {
-    return static_cast<T>(bessel_y1_forward(x));
-  }
-  template <typename T>
-  inline enable_if_t<is_integral_v<T>, float> operator()(const T x) {
-    return bessel_y1_forward(static_cast<float>(x));
-  }
-  inline float operator()(const bool x) {
-    return x ? -0.7812128067016602 : -INFINITY;
-  }
-};
-
-#define REGISTER_SPECIAL(DTI, DTO)                         \
-  REGISTER_UNARY_OP(bessel_j0_forward, DTI, DTO);          \
-  REGISTER_UNARY_OP(bessel_j1_forward, DTI, DTO);          \
-  REGISTER_UNARY_OP(modified_bessel_i0_forward, DTI, DTO); \
-  REGISTER_UNARY_OP(modified_bessel_i1_forward, DTI, DTO); \
-  REGISTER_UNARY_OP(bessel_y0_forward, DTI, DTO);          \
-  REGISTER_UNARY_OP(bessel_y1_forward, DTI, DTO);          \
-  REGISTER_UNARY_OP(i0, DTI, DTO);                         \
-  REGISTER_UNARY_OP(i0e, DTI, DTO);                        \
-  REGISTER_UNARY_OP(i1, DTI, DTO);                         \
-  REGISTER_UNARY_OP(i1e, DTI, DTO);                        \
-  REGISTER_UNARY_OP(spherical_bessel_j0, DTI, DTO);        \
+#define REGISTER_SPECIAL(DTI, DTO)                  \
+  REGISTER_UNARY_OP(i0, DTI, DTO);                  \
+  REGISTER_UNARY_OP(i1, DTI, DTO);                  \
+  REGISTER_UNARY_OP(spherical_bessel_j0, DTI, DTO); \
  REGISTER_UNARY_OP(entr, DTI, DTO)

 REGISTER_SPECIAL(float, float);
--- a/aten/src/ATen/native/mps/kernels/UpSample.metal
+++ b/aten/src/ATen/native/mps/kernels/UpSample.metal
@ -268,31 +268,12 @@ kernel void upsample_bilinear2d(
  }
 }

-struct BilinearFunctor {
-  inline float operator()(float x) {
-    x = abs(x);
-    return x < 1.0 ? 1.0 - x : x;
-  }
-  static constant constexpr float area_factor = 1.0;
-};
+// Triangle (tent) filter weight used by the anti-aliased bilinear kernel:
+// returns 1 - |x| inside the unit support and 0 everywhere else.
+inline float bilinear_functor(float x) {
+  const float ax = abs(x);
+  // A sample at or beyond the support edge must contribute nothing. The
+  // previous fallback returned |x|, which would hand such a sample a weight
+  // of at least 1 if rounding ever pushed it onto the boundary.
+  return ax < 1.0 ? 1.0 - ax : 0.0;
+}

-struct BicubicFunctor {
-  inline float operator()(float x) {
-    // https://en.wikipedia.org/wiki/Bicubic_interpolation#Bicubic_convolution_algorithm
-    x = abs(x);
-    if (x < 1.0) {
-      return 1.0 + (1.5 * x - 2.5) * x * x;
-    }
-    if (x < 2.0) {
-      return 2.0 - 0.5 * ((x - 5.0) * x + 8.0) * x;
-    }
-    return 0;
-  }
-  static constant constexpr float area_factor = 2.0;
-};
-
-template <typename T, typename F>
-kernel void upsample_2d_aa(
+template <typename T>
+kernel void upsample_bilinear2d_aa(
    constant T* inputData [[buffer(0)]],
    device T* outputData [[buffer(1)]],
    constant ulong4& input_strides [[buffer(2)]],
@ -305,26 +286,15 @@ kernel void upsample_2d_aa(
  auto output_x = thread_index % static_cast<uint>(output_sizes.w);
  auto output_y = thread_index / static_cast<uint>(output_sizes.w);
  (void)align_corners; // Align corners is unused for AA algorithm
-  F f;
  auto x_center = area_pixel_compute_source_index(
-      scales.x,
-      output_x,
-      /*align_corners=*/false,
-      /*cubic=*/F::area_factor == 2.0);
+      scales.x, output_x, /*align_corners=*/false, /*cubic=*/false);
  auto y_center = area_pixel_compute_source_index(
-      scales.y,
-      output_y,
-      /*align_corners=*/false,
-      /*cubic=*/F::area_factor == 2.0);
+      scales.y, output_y, /*align_corners=*/false, /*cubic=*/false);
  auto clamped_scales = max(1.0, scales);
-  auto x_min =
-      max(0L, long(floor(x_center - f.area_factor * clamped_scales.x + 1)));
-  auto x_max = min(
-      input_sizes.w, long(ceil(x_center + f.area_factor * clamped_scales.x)));
-  auto y_min =
-      max(0L, long(floor(y_center - f.area_factor * clamped_scales.y + 1)));
-  auto y_max = min(
-      input_sizes.z, long(ceil(y_center + f.area_factor * clamped_scales.y)));
+  auto x_min = max(0L, long(floor(x_center - clamped_scales.x + 1)));
+  auto x_max = min(input_sizes.w, long(ceil(x_center + clamped_scales.x)));
+  auto y_min = max(0L, long(floor(y_center - clamped_scales.y + 1)));
+  auto y_max = min(input_sizes.z, long(ceil(y_center + clamped_scales.y)));
  for (int n = 0; n < output_sizes.x; n++) {
    for (int c = 0; c < output_sizes.y; c++) {
      float res = 0.0;
@ -332,9 +302,9 @@ kernel void upsample_2d_aa(
      constant auto* input =
          inputData + n * input_strides.x + c * input_strides.y;
      for (auto y = y_min; y < y_max; ++y) {
-        auto dy = f((y - y_center) / clamped_scales.y);
+        auto dy = bilinear_functor((y - y_center) / clamped_scales.y);
        for (auto x = x_min; x < x_max; ++x) {
-          auto dx = f((x - x_center) / clamped_scales.x);
+          auto dx = bilinear_functor((x - x_center) / clamped_scales.x);
          auto val = input[x * input_strides.w + y * input_strides.z];
          res += val * dx * dy;
          ws += dx * dy;
@ -486,19 +456,6 @@ kernel void upsample_bicubic2d_backward(
          constant bool& align_corners [[buffer(7)]],              \
          uint thread_index [[thread_position_in_grid]])

-#define INSTANTIATE_UPSAMPLE_2D_AA(NAME, FUNCTOR, DTYPE)           \
-  template [[host_name("upsample_" #NAME "_" #DTYPE)]] kernel void \
-  upsample_2d_aa<DTYPE, FUNCTOR>(                                  \
-      constant DTYPE * inputData [[buffer(0)]],                    \
-      device DTYPE * outputData [[buffer(1)]],                     \
-      constant ulong4 & input_strides [[buffer(2)]],               \
-      constant ulong4 & output_strides [[buffer(3)]],              \
-      constant long4 & input_sizes [[buffer(4)]],                  \
-      constant long4 & output_sizes [[buffer(5)]],                 \
-      constant float2 & scales [[buffer(6)]],                      \
-      constant bool& align_corners [[buffer(7)]],                  \
-      uint thread_index [[thread_position_in_grid]])
-
 #define INSTANTIATE_UPSAMPLE_2D_BACKWARD(NAME, DTYPE)                       \
  template [[host_name("upsample_" #NAME "_backward_" #DTYPE)]] kernel void \
      upsample_##NAME##_backward<DTYPE>(                                    \
@ -525,12 +482,11 @@ kernel void upsample_bicubic2d_backward(
      constant bool& align_corners [[buffer(7)]],                 \
      uint thread_index [[thread_position_in_grid]])

-#define INSTANTIATE_UPSAMPLE_ALL(DTYPE)                              \
-  INSTANTIATE_UPSAMPLE_2D(bicubic2d, DTYPE);                         \
-  INSTANTIATE_UPSAMPLE_2D_AA(bicubic2d_aa, BicubicFunctor, DTYPE);   \
-  INSTANTIATE_UPSAMPLE_2D_BACKWARD(bicubic2d, DTYPE);                \
-  INSTANTIATE_UPSAMPLE_2D(bilinear2d, DTYPE);                        \
-  INSTANTIATE_UPSAMPLE_2D_AA(bilinear2d_aa, BilinearFunctor, DTYPE); \
+#define INSTANTIATE_UPSAMPLE_ALL(DTYPE)               \
+  INSTANTIATE_UPSAMPLE_2D(bicubic2d, DTYPE);          \
+  INSTANTIATE_UPSAMPLE_2D_BACKWARD(bicubic2d, DTYPE); \
+  INSTANTIATE_UPSAMPLE_2D(bilinear2d, DTYPE);         \
+  INSTANTIATE_UPSAMPLE_2D(bilinear2d_aa, DTYPE);      \
  INSTANTIATE_UPSAMPLE_LINEAR(DTYPE);

 INSTANTIATE_UPSAMPLE_2D(bilinear2d, uchar);
--- a/aten/src/ATen/native/mps/operations/Attention.mm
+++ b/aten/src/ATen/native/mps/operations/Attention.mm
@ -44,8 +44,7 @@ std::tuple<Tensor, Tensor> _scaled_dot_product_attention_math_mps(const Tensor&
    TORCH_CHECK(!attn_mask.has_value(),
                "_scaled_dot_product_attention: Explicit attn_mask should not be set when is_causal=True");
  }
-  TORCH_CHECK(query.size(-3) == key.size(-3) && key.size(-3) == value.size(-3),
-              "number of heads in query/key/value should match");
+
  TORCH_CHECK(dropout_p == 0.0, "_scaled_dot_product_attention_math_for_mps: dropout_p != 0.0 is not supported");
  TORCH_CHECK(macOS15_0_plus || (query.is_contiguous() && key.is_contiguous() && value.is_contiguous()),
              "_scaled_dot_product_attention_math_for_mps: query, key, and value must be contiguous");
@ -56,7 +55,6 @@ std::tuple<Tensor, Tensor> _scaled_dot_product_attention_math_mps(const Tensor&
  auto [q_, sq] = ensure_4d(query);
  auto [k_, sk] = ensure_4d(key);
  auto [v_, sv] = ensure_4d(value);
-
  std::optional<Tensor> mask_;
  if (attn_mask) {
    auto maskExpandedDims = query.sizes().vec();
--- a/aten/src/ATen/native/mps/operations/BinaryKernel.mm
+++ b/aten/src/ATen/native/mps/operations/BinaryKernel.mm
@ -23,13 +23,54 @@
 #endif

 namespace at::native {
+namespace mps {
+
 #ifndef PYTORCH_JIT_COMPILE_SHADERS
-static auto& lib = mps::MetalShaderLibrary::getBundledLibrary();
+static auto& lib = MetalShaderLibrary::getBundledLibrary();
 #else
 #include <ATen/native/mps/BinaryKernel_metallib.h>
 #endif

-namespace mps {
+// Encodes and dispatches the Metal kernel for a two-input elementwise op.
+//   iter           - iterator supplying input(0), input(1) and the output
+//   func_name      - kernel family name; the Metal type string of input(0)
+//                    is appended to pick the instantiation
+//   supports_dense - when true and the iterator is contiguous, the
+//                    "<func_name>_dense_<type>" kernel is used and no offset
+//                    buffer is generated
+// Fails via TORCH_CHECK when the common dtype is float64.
+// func_name is taken by const reference, matching the `const std::string&`
+// name parameter of the MetalShaderLibrary exec_*_kernel helpers.
+static void binary_mps_impl(TensorIteratorBase& iter, const std::string& func_name, bool supports_dense = true) {
+  TORCH_CHECK(iter.common_dtype() != at::kDouble, "float64 is not supported on MPS");
+
+  Tensor input = iter.input(0);
+  Tensor other = iter.input(1);
+  Tensor out = iter.output();
+
+  MPSStream* mpsStream = getCurrentMPSStream();
+  // One thread is launched per iterator element.
+  const uint32_t numThreads = iter.numel();
+  dispatch_sync_with_rethrow(mpsStream->queue(), ^() {
+    @autoreleasepool {
+      auto computeEncoder = mpsStream->commandEncoder();
+      if (supports_dense && iter.is_contiguous()) {
+        // Contiguous fast path: three buffers, no offsets. Note that this path
+        // is not wrapped in the profiler begin/end calls used below.
+        const auto kernel_name = fmt::format("{}_dense_{}", func_name, scalarToMetalTypeString(input));
+        auto binaryPSO = lib.getPipelineStateForFunc(kernel_name);
+        [computeEncoder setComputePipelineState:binaryPSO];
+        mtl_setArgs(computeEncoder, input, other, out);
+        mtl_dispatch1DJob(computeEncoder, binaryPSO, numThreads);
+        return;
+      }
+      // General path: per-thread offsets are generated from the iterator and
+      // bound at buffer index 3, after the two inputs and the output.
+      const std::string kernel = func_name + "_" + scalarToMetalTypeString(input);
+      auto kernelDataOffsets = generateKernelDataOffsets(computeEncoder, iter);
+
+      id<MTLComputePipelineState> binaryPSO = lib.getPipelineStateForFunc(kernel);
+
+      // this function call is a no-op if MPS Profiler is not enabled
+      getMPSProfiler().beginProfileKernel(binaryPSO, kernel, {input, other});
+
+      [computeEncoder setComputePipelineState:binaryPSO];
+      mtl_setArgs(computeEncoder, input, other, out);
+      [computeEncoder setBuffer:kernelDataOffsets offset:0 atIndex:3];
+      mtl_dispatch1DJob(computeEncoder, binaryPSO, numThreads);
+
+      getMPSProfiler().endProfileKernel(binaryPSO);
+    }
+  });
+}

 void complex_mul_out(const Tensor& input, const Tensor& other, const Tensor& output) {
  TORCH_INTERNAL_ASSERT(c10::isComplexType(input.scalar_type()) || c10::isComplexType(other.scalar_type()));
@ -48,43 +89,43 @@ void complex_mul_out(const Tensor& input, const Tensor& other, const Tensor& out
  auto iter =
      TensorIteratorConfig().add_output(output_as_real).add_input(input_as_real).add_input(other_as_real).build();

-  lib.exec_binary_kernel(iter, "complex_mul", /*supports_dense=*/false);
+  mps::binary_mps_impl(iter, "complex_mul", false);
 }

 } // namespace mps

 static void fmax_mps_kernel(TensorIteratorBase& iter) {
  if (isFloatingType(iter.common_dtype())) {
-    lib.exec_binary_kernel(iter, "fmax");
+    mps::binary_mps_impl(iter, "fmax");
  } else {
    at::maximum_out(const_cast<Tensor&>(iter.output()), iter.input(0), iter.input(1));
  }
 }
 static void fmin_mps_kernel(TensorIteratorBase& iter) {
  if (isFloatingType(iter.common_dtype())) {
-    lib.exec_binary_kernel(iter, "fmin");
+    mps::binary_mps_impl(iter, "fmin");
  } else {
    at::minimum_out(const_cast<Tensor&>(iter.output()), iter.input(0), iter.input(1));
  }
 }

 static void copysign_mps_kernel(TensorIteratorBase& iter) {
-  lib.exec_binary_kernel(iter, "copysign");
+  mps::binary_mps_impl(iter, "copysign");
 }

 static void nextafter_mps_kernel(TensorIteratorBase& iter) {
  TORCH_CHECK_TYPE(isFloatingType(iter.common_dtype()), "nextafter_mps not implemented for non-floating types");
-  lib.exec_binary_kernel(iter, "nextafter");
+  mps::binary_mps_impl(iter, "nextafter");
 }

 static void zeta_mps_kernel(TensorIteratorBase& iter) {
  TORCH_CHECK_TYPE(isFloatingType(iter.common_dtype()), "zeta_mps not implemented for non-floating types");
-  lib.exec_binary_kernel(iter, "zeta");
+  mps::binary_mps_impl(iter, "zeta");
 }

 static void xlog1py_mps_kernel(TensorIteratorBase& iter) {
  TORCH_CHECK_TYPE(isFloatingType(iter.common_dtype()), "xlog1py_mps not implemented for non-floating types");
-  lib.exec_binary_kernel(iter, "xlog1py");
+  mps::binary_mps_impl(iter, "xlog1py");
 }

 REGISTER_DISPATCH(fmax_stub, &fmax_mps_kernel)
@ -106,7 +147,7 @@ Tensor& polar_out_mps(const Tensor& abs, const Tensor& angle, Tensor& output) {
  auto output_as_real = at::view_as_real(output).select(output.dim(), 0);
  auto iter = TensorIteratorConfig().add_output(output_as_real).add_input(abs).add_input(angle).build();

-  lib.exec_binary_kernel(iter, "polar");
+  mps::binary_mps_impl(iter, "polar");
  return output;
 }

@ -122,7 +163,7 @@ Tensor& complex_out_mps(const Tensor& real, const Tensor& imag, Tensor& output)
  auto output_as_real = at::view_as_real(output).select(output.dim(), 0);
  auto iter = TensorIteratorConfig().add_output(output_as_real).add_input(real).add_input(imag).build();

-  lib.exec_binary_kernel(iter, "complex_kernel", /*supports_dense=*/false);
+  mps::binary_mps_impl(iter, "complex_kernel", false);
  return output;
 }
 } // namespace at::native
--- a/aten/src/ATen/native/mps/operations/BinaryOps.mm
+++ b/aten/src/ATen/native/mps/operations/BinaryOps.mm
@ -14,6 +14,7 @@
 #include <ATen/ops/atan2_native.h>
 #include <ATen/ops/div_native.h>
 #include <ATen/ops/eq_native.h>
+#include <ATen/ops/floor_divide_native.h>
 #include <ATen/ops/fmod_native.h>
 #include <ATen/ops/ge_native.h>
 #include <ATen/ops/gt_native.h>
@ -446,8 +447,19 @@ TORCH_IMPL_FUNC(pow_Scalar_out_mps)(const Scalar& base, const Tensor& exp, const
  }
 }

-static void div_floor_kernel_mps(TensorIteratorBase& iter) {
-  mps::div_mode_template(iter.input(0), iter.input(1), "floor", iter.output(0), "floor_divide_out");
+// out= variant of floor_divide for MPS: forwards to mps::div_mode_template
+// with rounding mode "floor", writing into `result`, and returns `result`.
+// The last argument ("floor_divide_out") is the op name handed to the template.
+Tensor& floor_divide_out_mps(const Tensor& self, const Tensor& other, Tensor& result) {
+  mps::div_mode_template(self, other, "floor", result, "floor_divide_out");
+  return result;
+}
+
+// Functional variant of floor_divide for MPS: allocates a fresh output,
+// fills it via mps::div_mode_template with rounding mode "floor", and
+// returns it by value.
+Tensor floor_divide_mps(const Tensor& self, const Tensor& other) {
+  // The output is created from `self` alone.
+  // NOTE(review): whether div_mode_template adjusts the output for a
+  // broadcast shape or a promoted dtype is not visible here - confirm.
+  Tensor output = at::empty_like(self);
+  mps::div_mode_template(self, other, "floor", output, "floor_divide");
+  return output;
+}
+
+// In-place variant of floor_divide for MPS: reuses the out= implementation
+// with `self` as both the left operand and the destination.
+Tensor& floor_divide_mps_(Tensor& self, const Tensor& other) {
+  return floor_divide_out_mps(self, other, self);
 }

 TORCH_IMPL_FUNC(remainder_out_mps)(const Tensor& self, const Tensor& other, const Tensor& output) {
@ -526,6 +538,4 @@ TORCH_IMPL_FUNC(xlogy_out_mps)(const Tensor& self, const Tensor& other, const Te
 TORCH_IMPL_FUNC(lerp_Scalar_mps)(const Tensor& self, const Tensor& end, const Scalar& weight, const Tensor& out) {
  mps::add_sub_lerp_template(self, end, weight, out, "lerp");
 }
-
-REGISTER_DISPATCH(div_floor_stub, &div_floor_kernel_mps);
 } // namespace at::native
--- a/aten/src/ATen/native/mps/operations/FusedSgdKernel.mm
+++ b/aten/src/ATen/native/mps/operations/FusedSgdKernel.mm
@ -60,25 +60,9 @@ static void _fused_sgd_with_momentum_kernel_mps_(TensorList params,
                                                 const bool is_first_step,
                                                 const std::optional<Tensor>& grad_scale,
                                                 const std::optional<Tensor>& found_inf) {
-  if (lr_tensor.is_cpu()) {
-    return _fused_sgd_with_momentum_kernel_mps_(params,
-                                                grads,
-                                                momentum_buffer_list,
-                                                weight_decay,
-                                                momentum,
-                                                lr_tensor.item<double>(),
-                                                dampening,
-                                                nesterov,
-                                                maximize,
-                                                is_first_step,
-                                                grad_scale,
-                                                found_inf);
-  }
  TORCH_CHECK_GT(momentum, 0);
  TORCH_CHECK(native::check_fast_path_restrictions({params, grads, momentum_buffer_list}));

-  TORCH_CHECK(lr_tensor.device() == params[0].device(), "lr must be on the same GPU device as the params");
-
  std::vector<std::vector<Tensor>> tensor_lists{params.vec(), grads.vec(), momentum_buffer_list.vec()};

  const auto kernel_name = "fused_sgd_momentum_" + scalarToMetalTypeString(params[0].scalar_type());
--- a/aten/src/ATen/native/mps/operations/SpecialOps.mm
+++ b/aten/src/ATen/native/mps/operations/SpecialOps.mm
@ -16,18 +16,10 @@ static void i0_kernel_mps(TensorIteratorBase& iter) {
  lib.exec_unary_kernel(iter, "i0");
 }

-static void i0e_kernel_mps(TensorIteratorBase& iter) {
-  lib.exec_unary_kernel(iter, "i0e");
-}
-
 static void i1_kernel_mps(TensorIteratorBase& iter) {
  lib.exec_unary_kernel(iter, "i1");
 }

-static void i1e_kernel_mps(TensorIteratorBase& iter) {
-  lib.exec_unary_kernel(iter, "i1e");
-}
-
 static void spherical_bessel_j0_kernel_mps(TensorIteratorBase& iter) {
  lib.exec_unary_kernel(iter, "spherical_bessel_j0");
 }
@ -36,40 +28,8 @@ static void entr_kernel_mps(TensorIteratorBase& iter) {
  lib.exec_unary_kernel(iter, "entr");
 }

-static void bessel_j0_kernel_mps(TensorIteratorBase& iter) {
-  lib.exec_unary_kernel(iter, "bessel_j0_forward");
-}
-
-static void bessel_j1_kernel_mps(TensorIteratorBase& iter) {
-  lib.exec_unary_kernel(iter, "bessel_j1_forward");
-}
-
-static void modified_bessel_i0_kernel_mps(TensorIteratorBase& iter) {
-  lib.exec_unary_kernel(iter, "modified_bessel_i0_forward");
-}
-
-static void modified_bessel_i1_kernel_mps(TensorIteratorBase& iter) {
-  lib.exec_unary_kernel(iter, "modified_bessel_i1_forward");
-}
-
-static void bessel_y0_kernel_mps(TensorIteratorBase& iter) {
-  lib.exec_unary_kernel(iter, "bessel_y0_forward");
-}
-
-static void bessel_y1_kernel_mps(TensorIteratorBase& iter) {
-  lib.exec_unary_kernel(iter, "bessel_y1_forward");
-}
-
 REGISTER_DISPATCH(i0_stub, &i0_kernel_mps)
-REGISTER_DISPATCH(special_i0e_stub, &i0e_kernel_mps)
 REGISTER_DISPATCH(special_i1_stub, &i1_kernel_mps)
-REGISTER_DISPATCH(special_i1e_stub, &i1e_kernel_mps)
-REGISTER_DISPATCH(special_bessel_j0_stub, &bessel_j0_kernel_mps)
-REGISTER_DISPATCH(special_bessel_j1_stub, &bessel_j1_kernel_mps)
-REGISTER_DISPATCH(special_modified_bessel_i0_stub, &modified_bessel_i0_kernel_mps)
-REGISTER_DISPATCH(special_modified_bessel_i1_stub, &modified_bessel_i1_kernel_mps)
-REGISTER_DISPATCH(special_bessel_y0_stub, &bessel_y0_kernel_mps)
-REGISTER_DISPATCH(special_bessel_y1_stub, &bessel_y1_kernel_mps)
 REGISTER_DISPATCH(special_spherical_bessel_j0_stub, &spherical_bessel_j0_kernel_mps)
 REGISTER_DISPATCH(special_entr_stub, &entr_kernel_mps)
 } // namespace at::native
--- a/aten/src/ATen/native/mps/operations/UnaryOps.mm
+++ b/aten/src/ATen/native/mps/operations/UnaryOps.mm
@ -280,7 +280,7 @@ Tensor& angle_out_mps(const Tensor& self, Tensor& output) {
    });
    return output;
  } else {
-    TORCH_CHECK(!self.is_complex(), "MPS does not support angle with complex input on macOS13")
+    TORCH_CHECK(!self.is_complex(), "MPS does not support angle with complex imput on macOS13")
    mps::unary_op(self, output, "angle_out_mps", ^MPSGraphTensor*(MPSGraph* mpsGraph, MPSGraphTensor* inputTensor) {
      // On macOS 13 with non-complex input, realPartOfTensor and imaginaryPartOfTensor are
      // not available, and NaN is not propagated correctly:
--- a/aten/src/ATen/native/mps/operations/UpSample.mm
+++ b/aten/src/ATen/native/mps/operations/UpSample.mm
@ -9,7 +9,6 @@
 #include <ATen/Functions.h>
 #include <ATen/NativeFunctions.h>
 #else
-#include <ATen/ops/_upsample_bicubic2d_aa_native.h>
 #include <ATen/ops/_upsample_bilinear2d_aa_backward_native.h>
 #include <ATen/ops/_upsample_bilinear2d_aa_native.h>
 #include <ATen/ops/_upsample_nearest_exact1d.h>
@ -468,16 +467,4 @@ TORCH_IMPL_FUNC(_upsample_bilinear2d_aa_out_mps)
  mps::upsample_kernel_out_template(input, output_size, align_corners, scales_h, scales_w, output, "bilinear2d_aa");
 }

-TORCH_IMPL_FUNC(_upsample_bicubic2d_aa_out_mps)
-(const Tensor& input,
- IntArrayRef output_size,
- bool align_corners,
- std::optional<double> scales_h,
- std::optional<double> scales_w,
- const Tensor& output) {
-  TORCH_CHECK(at::isFloatingType(input.scalar_type()),
-              "_upsample_bicubic2d_aa_out_mps only supports floating-point dtypes");
-  mps::upsample_kernel_out_template(input, output_size, align_corners, scales_h, scales_w, output, "bicubic2d_aa");
-}
-
 } // namespace at::native
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@ -2749,20 +2749,23 @@
  device_check: NoCheck   # TensorIterator
  variants: function, method
  dispatch:
-    CPU, CUDA, MPS: floor_divide
+    CPU, CUDA: floor_divide
+    MPS: floor_divide_mps
    SparseCPU, SparseCUDA: floor_divide_sparse

 - func: floor_divide_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)
  device_check: NoCheck   # TensorIterator
  variants: method
  dispatch:
-    CPU, CUDA, MPS: floor_divide_
+    CPU, CUDA: floor_divide_
+    MPS: floor_divide_mps_
    SparseCPU, SparseCUDA: floor_divide_sparse_

 - func: floor_divide.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
  device_check: NoCheck   # TensorIterator
  dispatch:
-    CPU, CUDA, MPS: floor_divide_out
+    CPU, CUDA: floor_divide_out
+    MPS: floor_divide_out_mps
    SparseCPU, SparseCUDA: floor_divide_out_sparse_zerodim

 - func: floor_divide.Scalar(Tensor self, Scalar other) -> Tensor
@ -12766,7 +12769,6 @@
  dispatch:
    CPU: _upsample_bicubic2d_aa_out_cpu
    CUDA: _upsample_bicubic2d_aa_out_cuda
-    MPS: _upsample_bicubic2d_aa_out_mps

 - func: _upsample_bicubic2d_aa(Tensor self, SymInt[2] output_size, bool align_corners, float? scales_h=None, float? scales_w=None) -> Tensor
  python_module: nn
@ -13498,7 +13500,7 @@
  structured: True
  structured_inherits: TensorIteratorBase
  dispatch:
-    CPU, CUDA, MPS: special_i0e_out
+    CPU, CUDA: special_i0e_out
  tags: pointwise

 - func: special_i1(Tensor self) -> Tensor
@ -13526,7 +13528,7 @@
  structured: True
  structured_inherits: TensorIteratorBase
  dispatch:
-    CPU, CUDA, MPS: special_i1e_out
+    CPU, CUDA: special_i1e_out
  tags: pointwise

 - func: special_logit(Tensor self, float? eps=None) -> Tensor
@ -14988,7 +14990,7 @@

 - func: special_bessel_j0.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
  dispatch:
-    CPU, CUDA, MPS: special_bessel_j0_out
+    CPU, CUDA: special_bessel_j0_out
  python_module: special
  structured_inherits: TensorIteratorBase
  structured: True
@ -15003,7 +15005,7 @@

 - func: special_bessel_j1.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
  dispatch:
-    CPU, CUDA, MPS: special_bessel_j1_out
+    CPU, CUDA: special_bessel_j1_out
  python_module: special
  structured_inherits: TensorIteratorBase
  structured: True
@ -15018,7 +15020,7 @@

 - func: special_bessel_y0.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
  dispatch:
-    CPU, CUDA, MPS: special_bessel_y0_out
+    CPU, CUDA: special_bessel_y0_out
  python_module: special
  structured_inherits: TensorIteratorBase
  structured: True
@ -15033,7 +15035,7 @@

 - func: special_bessel_y1.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
  dispatch:
-    CPU, CUDA, MPS: special_bessel_y1_out
+    CPU, CUDA: special_bessel_y1_out
  python_module: special
  structured_inherits: TensorIteratorBase
  structured: True
@ -15440,7 +15442,7 @@

 - func: special_modified_bessel_i0.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
  dispatch:
-    CPU, CUDA, MPS: special_modified_bessel_i0_out
+    CPU, CUDA: special_modified_bessel_i0_out
  python_module: special
  structured_inherits: TensorIteratorBase
  structured: True
@ -15455,7 +15457,7 @@

 - func: special_modified_bessel_i1.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)
  dispatch:
-    CPU, CUDA, MPS: special_modified_bessel_i1_out
+    CPU, CUDA: special_modified_bessel_i1_out
  python_module: special
  structured_inherits: TensorIteratorBase
  structured: True
--- a/aten/src/ATen/native/quantized/cpu/ACLUtils.cpp
+++ b/aten/src/ATen/native/quantized/cpu/ACLUtils.cpp
@ -1,359 +0,0 @@
-#include <ATen/native/quantized/cpu/ACLUtils.h>
-
-#if AT_MKLDNN_ACL_ENABLED()
-
-#include <ATen/Parallel.h>
-#ifndef AT_PER_OPERATOR_HEADERS
-#include <ATen/Functions.h>
-#else
-#include <ATen/ops/empty.h>
-#endif
-#include <arm_compute/core/Helpers.h>
-#include <arm_compute/core/Types.h>
-#include <arm_compute/core/Utils.h>
-#include <arm_compute/core/utils/quantization/AsymmHelpers.h>
-
-namespace at::native::acl_utils {
-
-QuantMatmul::QuantMatmul(
-    int64_t weight_dim_0,
-    int64_t weight_dim_1,
-    double weight_scale,
-    int64_t weight_offset,
-    int8_t* weight_ptr,
-    std::optional<float*> bias_ptr,
-    const QuantMatmulCacheKey& cache_key)
-    : key(cache_key) {
-  auto wei_q_tensor_info = arm_compute::TensorInfo(
-      arm_compute::TensorShape(weight_dim_1, weight_dim_0),
-      1,
-      arm_compute::DataType::QASYMM8_SIGNED,
-      arm_compute::QuantizationInfo(weight_scale, -weight_offset, false));
-  wei_q_tensor_info.set_are_values_constant(true);
-  wei_q_tensor_.allocator()->init(wei_q_tensor_info);
-  wei_q_tensor_.allocator()->import_memory(weight_ptr);
-
-  if (bias_ptr.has_value()) {
-    auto bia_tensor_info = arm_compute::TensorInfo(
-        arm_compute::TensorShape(1, weight_dim_1),
-        1,
-        arm_compute::DataType::F32);
-    bia_tensor_ = arm_compute::Tensor();
-
-    bia_tensor_->allocator()->init(bia_tensor_info);
-    bia_tensor_->allocator()->import_memory(bias_ptr.value());
-  }
-  const bool fuse_relu =
-      std::get<static_cast<int>(QuantMatmulCacheKeyIndex::FUSE_RELU)>(key);
-  if (fuse_relu) {
-    relu_info_ =
-        arm_compute::ActivationLayerInfo(arm_compute::ActivationFunction::RELU);
-  }
-}
-
-QuantMatmul::~QuantMatmul() {
-  // this will not free memory, it will just tell ACL that we're no longer
-  // using the pointer
-  wei_q_tensor_.allocator()->free();
-  if (bia_tensor_.has_value()) {
-    bia_tensor_->allocator()->free();
-  }
-}
-
-DynamicQuantMatmul::DynamicQuantMatmul(
-    int64_t weight_dim_0,
-    int64_t weight_dim_1,
-    double weight_scale,
-    int64_t weight_offset,
-    int8_t* weight_ptr,
-    std::optional<float*> bias_ptr,
-    const QuantMatmulCacheKey& cache_key)
-    : QuantMatmul(
-          weight_dim_0,
-          weight_dim_1,
-          weight_scale,
-          weight_offset,
-          weight_ptr,
-          bias_ptr,
-          cache_key) {
-  int64_t m = std::get<static_cast<int>(QuantMatmulCacheKeyIndex::M)>(key);
-
-  auto src_q_tensor_info = arm_compute::TensorInfo(
-      arm_compute::TensorShape(weight_dim_0, m),
-      1,
-      // ACL dyanamically quantized matmuls only support (signed) int8_t
-      arm_compute::DataType::QASYMM8_SIGNED,
-      // TODO: setting the initial offset value to int8_t max instead of zero,
-      // because ACL currently skips MatrixBReduction calculation if the
-      // source offset at configuration time is zero. This is fixed by this
-      // PR: https://review.mlplatform.org/c/ml/ComputeLibrary/+/12820/8 This
-      // will be set to the actual src offset value at runtime.
-      arm_compute::QuantizationInfo(
-          /*scale=*/1.0,
-          /*offset=*/std::numeric_limits<int8_t>::max(),
-          /*is_dynamic=*/true));
-  src_q_tensor_info.set_are_values_constant(false);
-
-  auto src_tensor_info = arm_compute::TensorInfo(
-      arm_compute::TensorShape(weight_dim_0, m), arm_compute::Format::F32);
-  src_tensor_info.set_are_values_constant(false);
-
-  auto dst_tensor_info = arm_compute::TensorInfo(
-      arm_compute::TensorShape(weight_dim_1, m), arm_compute::Format::F32);
-
-  src_q_tensor.allocator()->init(src_q_tensor_info);
-  src_tensor.allocator()->init(src_tensor_info);
-  dst_tensor.allocator()->init(dst_tensor_info);
-
-  src_q_tensor_orig_ =
-      at::empty({m, weight_dim_0}, at::device(c10::kCPU).dtype(c10::kQInt8));
-  // allocate/import memory
-  src_q_tensor.allocator()->import_memory(src_q_tensor_orig_.data_ptr());
-
-  if (relu_info_.has_value()) {
-    relu = arm_compute::NEActivationLayer();
-  }
-}
-
-DynamicQuantMatmul::~DynamicQuantMatmul() {
-  // this will not free memory, it will just tell ACL that we're no longer
-  // using the pointer
-  src_q_tensor.allocator()->free();
-}
-
-arm_compute::Status DynamicQuantMatmul::validate() {
-  if (relu_info_.has_value()) {
-    auto relu_status = arm_compute::NEActivationLayer::validate(
-        dst_tensor.info(), dst_tensor.info(), relu_info_.value());
-    if (relu_status.error_code() != arm_compute::ErrorCode::OK) {
-      return relu_status;
-    }
-  }
-  auto quant_status = arm_compute::NEQuantizationLayer::validate(
-      src_tensor.info(), src_q_tensor.info());
-  if (quant_status.error_code() != arm_compute::ErrorCode::OK) {
-    return quant_status;
-  }
-  return arm_compute::NEGEMMLowpMatrixMultiplyCore::validate(
-      src_q_tensor.info(),
-      wei_q_tensor_.info(),
-      bia_tensor_.has_value() ? bia_tensor_.value().info() : nullptr,
-      dst_tensor.info(),
-      gemm_info_);
-}
-
-void DynamicQuantMatmul::configure() {
-  quant.configure(&src_tensor, &src_q_tensor);
-  gemm.configure(
-      &src_q_tensor,
-      &wei_q_tensor_,
-      bia_tensor_.has_value() ? &bia_tensor_.value() : nullptr,
-      &dst_tensor,
-      gemm_info_);
-  if (relu.has_value()) {
-    relu->configure(&dst_tensor, &dst_tensor, relu_info_.value());
-  }
-}
-
-StaticQuantMatmul::StaticQuantMatmul(
-    int64_t weight_dim_0,
-    int64_t weight_dim_1,
-    double weight_scale,
-    int64_t weight_offset,
-    int8_t* weight_ptr,
-    std::optional<float*> bias_ptr,
-    const QuantMatmulCacheKey& cache_key)
-    : QuantMatmul(
-          weight_dim_0,
-          weight_dim_1,
-          weight_scale,
-          weight_offset,
-          weight_ptr,
-          bias_ptr,
-          cache_key) {
-  const int64_t m =
-      std::get<static_cast<int>(QuantMatmulCacheKeyIndex::M)>(key);
-  const int64_t input_zero_point =
-      std::get<static_cast<int>(QuantMatmulCacheKeyIndex::INPUT_OFFSET)>(key);
-  const double input_scale =
-      std::get<static_cast<int>(QuantMatmulCacheKeyIndex::INPUT_SCALE)>(key);
-  const int64_t output_zero_point =
-      std::get<static_cast<int>(QuantMatmulCacheKeyIndex::OUTPUT_OFFSET)>(key);
-  const double output_scale =
-      std::get<static_cast<int>(QuantMatmulCacheKeyIndex::OUTPUT_SCALE)>(key);
-  const bool signed_input =
-      std::get<static_cast<int>(QuantMatmulCacheKeyIndex::SIGNED_INPUT)>(key);
-
-  const auto input_acl_datatype = signed_input
-      ? arm_compute::DataType::QASYMM8_SIGNED
-      : arm_compute::DataType::QASYMM8;
-
-  auto src_q_tensor_info = arm_compute::TensorInfo(
-      arm_compute::TensorShape(weight_dim_0, m),
-      1,
-      input_acl_datatype,
-      arm_compute::QuantizationInfo(input_scale, -input_zero_point, false));
-  src_q_tensor_info.set_are_values_constant(false);
-  src_q_tensor.allocator()->init(src_q_tensor_info);
-
-  if (bias_ptr.has_value()) {
-    auto bia_q_tensor_info = arm_compute::TensorInfo(
-        arm_compute::TensorShape(1, weight_dim_1),
-        1,
-        arm_compute::DataType::S32,
-        arm_compute::QuantizationInfo(
-            1 / (input_scale * weight_scale), 0, false));
-    bia_q_tensor_ = arm_compute::Tensor();
-    bia_q_tensor_.value().allocator()->init(bia_q_tensor_info);
-
-    float* bias_fp32_buffer = (float*)bia_tensor_.value().buffer();
-    bia_q_tensor_orig_ =
-        at::empty({m, weight_dim_0}, at::device(c10::kCPU).dtype(c10::kQInt32));
-    int32_t* bias_s32_buffer = (int32_t*)bia_q_tensor_orig_.value().data_ptr();
-    const float bias_scale =
-        bia_q_tensor_info.quantization_info().uniform().scale;
-    // Quantize the bias to int32_t. It makes sense to do it here rather in the
-    // prepack phase because dynamically quantized ACL matmuls don't need the
-    // bias in int32_t.
-    at::parallel_for(0, weight_dim_1, 1, [&](int64_t start, int64_t end) {
-      for (int64_t i = start; i < end; ++i) {
-        bias_s32_buffer[i] =
-            int32_t(std::round(bias_fp32_buffer[i] * bias_scale));
-      }
-    });
-    bia_q_tensor_.value().allocator()->import_memory(bias_s32_buffer);
-  }
-  auto dst_q_tensor_info = arm_compute::TensorInfo(
-      arm_compute::TensorShape(weight_dim_1, m),
-      1,
-      input_acl_datatype,
-      arm_compute::QuantizationInfo(output_scale, output_zero_point, false));
-  dst_q_tensor.allocator()->init(dst_q_tensor_info);
-
-  // Setup lowp_gemm output stage
-  int output_multiplier;
-  int output_shift;
-  float multiplier = (input_scale * weight_scale) / output_scale;
-  arm_compute::quantization::calculate_quantized_multiplier_less_than_one(
-      multiplier, &output_multiplier, &output_shift);
-
-  arm_compute::GEMMLowpOutputStageInfo output_stage_info;
-  output_stage_info.type =
-      arm_compute::GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
-  output_stage_info.gemmlowp_multiplier = output_multiplier;
-  output_stage_info.gemmlowp_shift = output_shift;
-  output_stage_info.gemmlowp_offset = output_zero_point;
-
-  int32_t min_activation = signed_input ? std::numeric_limits<int8_t>::min()
-                                        : std::numeric_limits<uint8_t>::min();
-  int32_t max_activation = signed_input ? std::numeric_limits<int8_t>::max()
-                                        : std::numeric_limits<uint8_t>::max();
-
-  if (relu_info_.has_value()) {
-    // figure out min, max values for ReLU
-    const arm_compute::UniformQuantizationInfo uqinfo =
-        dst_q_tensor_info.quantization_info().uniform();
-    std::tie(min_activation, max_activation) =
-        arm_compute::get_quantized_activation_min_max(
-            relu_info_.value(), src_q_tensor_info.data_type(), uqinfo);
-    // fuse ReLU with the GEMM
-    gemm_info_.set_activation_info(relu_info_.value());
-  }
-  output_stage_info.gemmlowp_min_bound = min_activation;
-  output_stage_info.gemmlowp_max_bound = max_activation;
-  output_stage_info.output_data_type = dst_q_tensor_info.data_type();
-
-  gemm_info_.set_gemmlowp_output_stage(output_stage_info);
-}
-
-StaticQuantMatmul::~StaticQuantMatmul() {
-  // this will not free memory, it will just tell ACL that we're no longer
-  // using the pointer
-  if (bia_q_tensor_.has_value()) {
-    bia_q_tensor_.value().allocator()->free();
-  }
-}
-
-arm_compute::Status StaticQuantMatmul::validate() {
-  return arm_compute::NEGEMMLowpMatrixMultiplyCore::validate(
-      src_q_tensor.info(),
-      wei_q_tensor_.info(),
-      bia_q_tensor_.has_value() ? bia_q_tensor_.value().info() : nullptr,
-      dst_q_tensor.info(),
-      gemm_info_);
-}
-
-void StaticQuantMatmul::configure() {
-  gemm.configure(
-      &src_q_tensor,
-      &wei_q_tensor_,
-      bia_q_tensor_.has_value() ? &bia_q_tensor_.value() : nullptr,
-      &dst_q_tensor,
-      gemm_info_);
-}
-
-QuantAdd::QuantAdd(
-    arm_compute::DataType dtype,
-    const std::vector<int64_t>& input_dims,
-    double qa_scale,
-    int64_t qa_offset,
-    double qb_scale,
-    int64_t qb_offset,
-    double dst_scale,
-    int64_t dst_offset) {
-  arm_compute::QuantizationInfo qa_qinfo = {
-      static_cast<float>(qa_scale), static_cast<int32_t>(qa_offset), false};
-  arm_compute::QuantizationInfo qb_qinfo = {
-      static_cast<float>(qb_scale), static_cast<int32_t>(qb_offset), false};
-  arm_compute::QuantizationInfo qdst_qinfo = {
-      static_cast<float>(dst_scale), static_cast<int32_t>(dst_offset), false};
-
-  arm_compute::TensorShape qa_acl_tensor_shape;
-  arm_compute::TensorShape qb_acl_tensor_shape;
-  arm_compute::TensorShape qdst_acl_tensor_shape;
-  for (int i = input_dims.size() - 1; i >= 0; i--) {
-    qa_acl_tensor_shape.set(i, input_dims[i], false, true);
-    qb_acl_tensor_shape.set(i, input_dims[i], false, true);
-    qdst_acl_tensor_shape.set(i, input_dims[i], false, true);
-  }
-  arm_compute::TensorInfo qa_acl_tensor_info(
-      qa_acl_tensor_shape, 1, dtype, qa_qinfo);
-  arm_compute::TensorInfo qb_acl_tensor_info(
-      qb_acl_tensor_shape, 1, dtype, qb_qinfo);
-  arm_compute::TensorInfo qdst_acl_tensor_info(
-      qdst_acl_tensor_shape, 1, dtype, qdst_qinfo);
-
-  qa_tensor.allocator()->init(qa_acl_tensor_info);
-  qb_tensor.allocator()->init(qb_acl_tensor_info);
-  qdst_tensor.allocator()->init(qdst_acl_tensor_info);
-}
-
-arm_compute::Status QuantAdd::validate() {
-  return q_add.validate(
-      qa_tensor.info(), qb_tensor.info(), qdst_tensor.info(), policy);
-}
-
-void QuantAdd::configure() {
-  q_add.configure(&qa_tensor, &qb_tensor, &qdst_tensor, policy);
-}
-
-} // namespace at::native::acl_utils
-
-PackedLinearWeightsACL::PackedLinearWeightsACL(
-    std::unique_ptr<ideep::tensor> weight,
-    std::optional<ideep::tensor> bias,
-    at::Tensor orig_weight,
-    std::optional<at::Tensor> orig_bias)
-    : PackedLinearWeightsOnednn(
-          std::move(weight),
-          std::move(bias),
-          std::move(orig_weight),
-          std::move(orig_bias)) {
-  auto w = *(weight_.get());
-  k_ = w.get_dim(0);
-  n_ = w.get_dim(1);
-  weight_zero_point_ = orig_weight_.q_zero_point();
-  weight_scale_ = orig_weight_.q_scale();
-}
-
-#endif // AT_MKLDNN_ACL_ENABLED()
--- a/aten/src/ATen/native/quantized/cpu/ACLUtils.h
+++ b/aten/src/ATen/native/quantized/cpu/ACLUtils.h
@ -1,257 +0,0 @@
-#pragma once
-
-#include <ATen/Config.h>
-#if AT_MKLDNN_ACL_ENABLED()
-
-#include <ATen/native/quantized/cpu/OnednnUtils.h>
-#include <arm_compute/core/Error.h>
-#include <arm_compute/core/TensorInfo.h>
-#include <arm_compute/function_info/ActivationLayerInfo.h>
-#include <arm_compute/runtime/NEON/functions/NEActivationLayer.h>
-#include <arm_compute/runtime/NEON/functions/NEArithmeticAddition.h>
-#include <arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h>
-#include <arm_compute/runtime/NEON/functions/NEQuantizationLayer.h>
-#include <arm_compute/runtime/Tensor.h>
-#include <array>
-
-// Utilities for Arm Compute Library (ACL) quantized operations
-// Provides interfaces to leverage ACL's accelerated kernels for statically and
-// dynamically quantized matmuls (i.e. qlinear and qlinear_dynamic) These are
-// utalized through PackedLinearWeightsACL which extends
-// PackedLinearWeightsOnednn Note that PackedLinearWeightsACL extends rather
-// than replaces PackedLinearWeightsOnednn for AArch64 because ACL currently
-// only supports per_tensor weight quantization.
-namespace at::native::acl_utils {
-
-using QuantMatmulCacheKey = std::tuple<
-    int64_t, // M
-    bool, // FUSE_RELU
-    int64_t, // NUM_THREADS
-    double, // INPUT_SCALE
-    int64_t, // INPUT_OFFSET
-    double, // OUTPUT_SCALE
-    int64_t, // OUTPUT_OFFSET
-    bool // SIGNED_INPUT
-    >;
-
-enum class QuantMatmulCacheKeyIndex {
-  M,
-  FUSE_RELU,
-  NUM_THREADS,
-  INPUT_SCALE,
-  INPUT_OFFSET,
-  OUTPUT_SCALE,
-  OUTPUT_OFFSET,
-  SIGNED_INPUT
-};
-
-// Abstract interface to share common stuff between static/dynamic ACL matmuls.
-struct QuantMatmul {
-  arm_compute::NEGEMMLowpMatrixMultiplyCore gemm;
-  // key for use in the cache
-  QuantMatmulCacheKey key;
-
-  QuantMatmul(
-      int64_t weight_dim_0,
-      int64_t weight_dim_1,
-      double weight_scale,
-      int64_t weight_offset,
-      int8_t* weight_ptr,
-      std::optional<float*> bias_ptr,
-      const QuantMatmulCacheKey& cache_key);
-
-  virtual ~QuantMatmul();
-  virtual arm_compute::Status validate() = 0;
-  virtual void configure() = 0;
-
- protected:
-  arm_compute::Tensor wei_q_tensor_;
-  std::optional<arm_compute::Tensor> bia_tensor_;
-  arm_compute::GEMMInfo gemm_info_;
-  std::optional<arm_compute::ActivationLayerInfo> relu_info_;
-};
-
-struct DynamicQuantMatmul : public QuantMatmul {
-  arm_compute::Tensor src_q_tensor;
-  arm_compute::Tensor src_tensor;
-  arm_compute::Tensor dst_tensor;
-  arm_compute::NEQuantizationLayer quant;
-  // We need a ReLU layer here (unlike static quantization) because the ReLU
-  // cannot be "truly" fused with the GEMM through gemm_info in ACL dynamically
-  // quantized matmuls.
-  std::optional<arm_compute::NEActivationLayer> relu;
-
-  DynamicQuantMatmul(
-      int64_t weight_dim_0,
-      int64_t weight_dim_1,
-      double weight_scale,
-      int64_t weight_offset,
-      int8_t* weight_ptr,
-      std::optional<float*> bias_ptr,
-      const QuantMatmulCacheKey& cache_key);
-
-  ~DynamicQuantMatmul() override;
-
-  arm_compute::Status validate() override;
-  void configure() override;
-
- private:
-  at::Tensor src_q_tensor_orig_;
-};
-
-struct StaticQuantMatmul : public QuantMatmul {
-  arm_compute::Tensor src_q_tensor;
-  arm_compute::Tensor dst_q_tensor;
-
-  StaticQuantMatmul(
-      int64_t weight_dim_0,
-      int64_t weight_dim_1,
-      double weight_scale,
-      int64_t weight_offset,
-      int8_t* weight_ptr,
-      std::optional<float*> bias_ptr,
-      const QuantMatmulCacheKey& cache_key);
-
-  ~StaticQuantMatmul() override;
-
-  arm_compute::Status validate() override;
-  void configure() override;
-
- private:
-  std::optional<arm_compute::Tensor> bia_q_tensor_;
-  std::optional<at::Tensor> bia_q_tensor_orig_;
-};
-
-struct QuantAdd {
-  arm_compute::Tensor qa_tensor;
-  arm_compute::Tensor qb_tensor;
-  arm_compute::Tensor qdst_tensor;
-  arm_compute::NEArithmeticAddition q_add;
-
-  QuantAdd(
-      arm_compute::DataType dtype,
-      const std::vector<int64_t>& input_dims,
-      double qa_scale,
-      int64_t qa_offset,
-      double qb_scale,
-      int64_t qb_offset,
-      double dst_scale,
-      int64_t dst_offset);
-
-  arm_compute::Status validate();
-  void configure();
-
- private:
-  arm_compute::ConvertPolicy policy{arm_compute::ConvertPolicy::SATURATE};
-};
-
-} // namespace at::native::acl_utils
-struct PackedLinearWeightsACL : public PackedLinearWeightsOnednn {
-  using ACLQuantMatmul = at::native::acl_utils::QuantMatmul;
-  using ACLDynamicQuantMatmul = at::native::acl_utils::DynamicQuantMatmul;
-  using ACLStaticQuantMatmul = at::native::acl_utils::StaticQuantMatmul;
-  using ACLQuantMatmulCacheKey = at::native::acl_utils::QuantMatmulCacheKey;
-  using ACLQuantMatmulCacheKeyIndex =
-      at::native::acl_utils::QuantMatmulCacheKeyIndex;
-
-  PackedLinearWeightsACL(
-      std::unique_ptr<ideep::tensor> weight,
-      std::optional<ideep::tensor> bias,
-      at::Tensor orig_weight,
-      std::optional<at::Tensor> orig_bias);
-
-  at::Tensor apply_dynamic(at::Tensor input, bool reduce_range = false)
-      override;
-  at::Tensor apply_dynamic_relu(at::Tensor input, bool reduce_range = false)
-      override;
-
-  at::Tensor apply(
-      at::Tensor input,
-      double output_scale,
-      int64_t output_zero_point) override;
-  at::Tensor apply_relu(
-      at::Tensor input,
-      double output_scale,
-      int64_t output_zero_point) override;
-
-  template <typename ACLQuantMatmulT>
-  std::shared_ptr<ACLQuantMatmulT> get_acl_quant_matmul(
-      const ACLQuantMatmulCacheKey& key) {
-    return std::dynamic_pointer_cast<ACLQuantMatmulT>(
-        fetch_or_create_acl_quant_matmul<ACLQuantMatmulT>(key));
-  }
-
- private:
-  int64_t k_;
-  int64_t n_;
-  int64_t weight_zero_point_;
-  double weight_scale_;
-
-  // A 2 element (per layer) cache. Given it's not intended to store more than 2
-  // elements, we do not need a fancy implementation. The idea behind it is to
-  // allow for a (configuration free) fast path for autoregressive
-  // transformer-like models which usually involve 2 input tensor shapes; one
-  // for the prefill phase and another for the autoregressive phase
-  std::array<std::shared_ptr<ACLQuantMatmul>, 2> cache_;
-
-  template <typename ACLQuantMatmulT>
-  std::shared_ptr<ACLQuantMatmul> fetch_or_create_acl_quant_matmul(
-      const ACLQuantMatmulCacheKey& key) {
-    // We're only maintaining a 2 element LRU cache
-    // hit first
-    if (cache_[0] != nullptr && cache_[0]->key == key) {
-      return cache_[0];
-    }
-    // hit second
-    if (cache_[1] != nullptr && cache_[1]->key == key) {
-      // Update LRU
-      std::swap(cache_[0], cache_[1]);
-      return cache_[0];
-    }
-    // miss -> replace Least Recently Used - i.e. element at index 1
-    cache_[1] = create_acl_quant_matmul<ACLQuantMatmulT>(key);
-    std::swap(cache_[0], cache_[1]);
-    return cache_[0];
-  }
-
-  template <typename ACLQuantMatmulT>
-  std::shared_ptr<ACLQuantMatmulT> create_acl_quant_matmul(
-      const ACLQuantMatmulCacheKey& key) {
-    std::optional<float*> bias_ptr;
-    if (bias_.has_value()) {
-      bias_ptr = (float*)bias_.value().get_data_handle();
-    }
-    auto acl_gemm = std::make_shared<ACLQuantMatmulT>(
-        k_,
-        n_,
-        weight_scale_,
-        weight_zero_point_,
-        (int8_t*)weight_.get()->get_data_handle(),
-        bias_ptr,
-        key);
-
-    // validate
-    auto status = acl_gemm->validate();
-    if (status.error_code() != arm_compute::ErrorCode::OK) {
-      TORCH_WARN(
-          "Arm Compute Library's Quantized Matmul Validation Failed: " +
-          status.error_description());
-      return nullptr;
-    }
-
-    // configure
-    acl_gemm->configure();
-    return acl_gemm;
-  }
-
-  template <bool ReluFused>
-  at::Tensor apply_dynamic_impl(at::Tensor input, bool reduce_range = false);
-
-  template <bool ReluFused>
-  at::Tensor apply_impl(
-      at::Tensor input,
-      double output_scale,
-      int64_t output_zero_point);
-};
-
-#endif // AT_MKLDNN_ACL_ENABLED()
--- a/aten/src/ATen/native/quantized/cpu/BinaryOps.cpp
+++ b/aten/src/ATen/native/quantized/cpu/BinaryOps.cpp
@ -5,7 +5,6 @@
 #include <ATen/ExpandUtils.h>
 #include <torch/library.h>
 #include <ATen/quantized/Quantizer.h>
-#include <ATen/native/quantized/cpu/ACLUtils.h>
 #include <ATen/native/quantized/cpu/BinaryOps.h>
 #include <ATen/native/quantized/cpu/QuantizedOps.h>
 #include <ATen/native/quantized/cpu/init_qnnpack.h>
@ -385,67 +384,6 @@ Tensor xnnp_add(Tensor qa, Tensor qb, double scale, int64_t zero_point) {
 }
 #endif // USE_XNNPACK

-#if AT_MKLDNN_ACL_ENABLED()
-Tensor acl_qadd(Tensor qa, Tensor qb, double scale, int64_t zero_point) {
-  TORCH_CHECK(
-      qa.qscheme() == kPerTensorAffine || qa.qscheme() == kPerTensorSymmetric,
-      "Only per tensor quantization is supported in ACL quantized add.");
-
-  Tensor qa_contig = qa.contiguous(qa.suggest_memory_format());
-  Tensor qb_contig = qb.contiguous(qa.suggest_memory_format());
-  auto qa_mem_format = qa_contig.suggest_memory_format();
-  Tensor dst = at::native::empty_affine_quantized(
-      at::infer_size_dimvector(qa_contig.sizes(), qb_contig.sizes()),
-      qa_contig.scalar_type(),
-      std::nullopt /* layout */,
-      kCPU,
-      std::nullopt /* pin_memory */,
-      scale,
-      zero_point,
-      qa_mem_format);
-
-  if (qb_contig.size(0) == 0) {
-    return dst;
-  }
-
-  auto input_dims = qa_contig.sizes().vec();
-  auto acl_dtype = dst.scalar_type() == kQInt8
-      ? arm_compute::DataType::QASYMM8_SIGNED
-      : arm_compute::DataType::QASYMM8;
-  auto acl_add = std::make_shared<acl_utils::QuantAdd>(
-      acl_dtype,
-      input_dims,
-      qa_contig.q_scale(),
-      qa_contig.q_zero_point(),
-      qb_contig.q_scale(),
-      qb_contig.q_zero_point(),
-      dst.q_scale(),
-      dst.q_zero_point());
-
-  auto status = acl_add->validate();
-  TORCH_CHECK(
-      status.error_code() == arm_compute::ErrorCode::OK,
-      "Arm Compute Library's Quantized Matmul Validation Failed: " +
-          status.error_description());
-
-  acl_add->configure();
-
-  acl_add->qa_tensor.allocator()->import_memory(qa_contig.data_ptr());
-  acl_add->qb_tensor.allocator()->import_memory(qb_contig.data_ptr());
-  acl_add->qdst_tensor.allocator()->import_memory(dst.data_ptr());
-
-  acl_add->q_add.run();
-
-  // this will not free memory, it will just tell ACL that we're no longer
-  // using the pointer
-  acl_add->qa_tensor.allocator()->free();
-  acl_add->qb_tensor.allocator()->free();
-  acl_add->qdst_tensor.allocator()->free();
-
-  return dst;
-}
-#endif // AT_MKLDNN_ACL_ENABLED()
-
 template <bool ReLUFused = false>
 Tensor qadd(Tensor qa, Tensor qb, double scale, int64_t zero_point) {
  check_inputs(qa, qb);
@ -468,15 +406,6 @@ Tensor qadd(Tensor qa, Tensor qb, double scale, int64_t zero_point) {
    }
 #endif // USE_PYTORCH_QNNPACK
  }
-
-#if AT_MKLDNN_ACL_ENABLED()
-  if (!ReLUFused && qa.ndimension() > 0 && qa.sizes() == qb.sizes() &&
-      qa.scalar_type() == qb.scalar_type() &&
-      (qa.scalar_type() == kQInt8 || qa.scalar_type() == kQUInt8)) {
-    return acl_qadd(qa, qb, scale, zero_point);
-  }
-#endif // AT_MKLDNN_ACL_ENABLED()
-
  auto qc = at::_empty_affine_quantized(
      qa.sizes(),
      at::device(kCPU)
--- a/aten/src/ATen/native/quantized/cpu/qlinear.cpp
+++ b/aten/src/ATen/native/quantized/cpu/qlinear.cpp
@ -1,18 +1,17 @@
 #define TORCH_ASSERT_ONLY_METHOD_OPERATORS
+#include <ATen/core/Tensor.h>
 #include <ATen/Context.h>
 #include <ATen/Parallel.h>
 #include <ATen/TensorOperators.h>
-#include <ATen/core/Tensor.h>
-#include <ATen/native/mkldnn/MKLDNNCommon.h>
-#include <ATen/native/quantized/PackedParams.h>
-#include <ATen/native/quantized/cpu/ACLUtils.h>
-#include <ATen/native/quantized/cpu/OnednnUtils.h>
-#include <ATen/native/quantized/cpu/QnnpackUtils.h>
-#include <ATen/native/quantized/cpu/QuantUtils.h>
-#include <ATen/native/quantized/cpu/XnnpackUtils.h>
 #include <ATen/native/quantized/cpu/fbgemm_utils.h>
+#include <ATen/native/quantized/cpu/QnnpackUtils.h>
+#include <ATen/native/quantized/cpu/XnnpackUtils.h>
+#include <ATen/native/quantized/cpu/OnednnUtils.h>
+#include <ATen/native/quantized/cpu/QuantUtils.h>
 #include <ATen/native/quantized/cpu/qlinear.h>
 #include <ATen/native/quantized/library.h>
+#include <ATen/native/quantized/PackedParams.h>
+#include <ATen/native/mkldnn/MKLDNNCommon.h>
 #include <caffe2/utils/threadpool/pthreadpool-cpp.h>
 #include <torch/library.h>

@ -1108,96 +1107,6 @@ static at::Tensor linear_int8_with_onednn_weight(
  primitive.execute(ideep::stream::default_stream(), args);
  return dim == 2 ? output : output.reshape(output_size);
 }
-
-#if AT_MKLDNN_ACL_ENABLED()
-
-template <bool ReluFused>
-at::Tensor PackedLinearWeightsACL::apply_impl(
-    at::Tensor input,
-    double output_scale,
-    int64_t output_zero_point) {
-  const int64_t dim = input.dim();
-  TORCH_CHECK(
-      dim != 0, "qlinear (ACL): input dim should be at least 1, but got 0");
-  TORCH_CHECK(
-      input.scalar_type() == c10::ScalarType::QUInt8 ||
-          input.scalar_type() == c10::ScalarType::QInt8,
-      "qlinear (ACL): data type of input should be QUInt8 or QInt8.");
-
-  auto input_contig = input.expect_contiguous();
-
-  int64_t m = input.numel() / k_;
-  double input_scale = input.q_scale();
-  int64_t input_zero_point = input.q_zero_point();
-  auto is_input_qint8 = input.scalar_type() == c10::ScalarType::QInt8;
-  auto key = std::make_tuple(
-      m,
-      ReluFused,
-      static_cast<int64_t>(at::get_num_threads()),
-      input_scale,
-      input_zero_point,
-      output_scale,
-      output_zero_point,
-      is_input_qint8);
-
-  auto acl_gemm =
-      get_acl_quant_matmul<at::native::acl_utils::StaticQuantMatmul>(key);
-  if (acl_gemm) {
-    acl_gemm->src_q_tensor.allocator()->import_memory(input_contig->data_ptr());
-
-    auto dst_dims = {m, n_};
-    at::Tensor output = at::_empty_affine_quantized(
-        dst_dims,
-        at::device(c10::kCPU).dtype(
-            is_input_qint8 ? c10::kQInt8 : c10::kQUInt8),
-        output_scale,
-        output_zero_point);
-
-    if (output.numel() == 0) {
-      return output;
-    }
-
-    acl_gemm->dst_q_tensor.allocator()->import_memory(output.data_ptr());
-
-    acl_gemm->gemm.run();
-
-    acl_gemm->src_q_tensor.allocator()->free();
-    acl_gemm->dst_q_tensor.allocator()->free();
-
-    auto out_sizes = input.sizes().vec();
-    out_sizes.back() = n_;
-
-    if (output.sizes().vec() == out_sizes)
-      return output;
-    return output.reshape(out_sizes);
-  }
-  // fallback to oneDNN in the unlikely scinario that ACL's validation fails
-  if (ReluFused) {
-    return PackedLinearWeightsOnednn::apply_relu(
-        input, output_scale, output_zero_point);
-  } else {
-    return PackedLinearWeightsOnednn::apply(
-        input, output_scale, output_zero_point);
-  }
-}
-
-at::Tensor PackedLinearWeightsACL::apply(
-    at::Tensor input,
-    double output_scale,
-    int64_t output_zero_point) {
-  return apply_impl</*ReluFused=*/false>(
-      std::move(input), output_scale, output_zero_point);
-}
-
-at::Tensor PackedLinearWeightsACL::apply_relu(
-    at::Tensor input,
-    double output_scale,
-    int64_t output_zero_point) {
-  return apply_impl</*ReluFused=*/true>(
-      std::move(input), output_scale, output_zero_point);
-}
-
-#endif // AT_MKLDNN_ACL_ENABLED()
 #endif // #if AT_MKLDNN_ENABLED()

 namespace at::native {
--- a/aten/src/ATen/native/quantized/cpu/qlinear_dynamic.cpp
+++ b/aten/src/ATen/native/quantized/cpu/qlinear_dynamic.cpp
@ -5,7 +5,6 @@
 #include <ATen/native/quantized/cpu/fbgemm_utils.h>
 #include <ATen/native/quantized/cpu/QnnpackUtils.h>
 #include <ATen/native/quantized/cpu/OnednnUtils.h>
-#include <ATen/native/quantized/cpu/ACLUtils.h>
 #include <ATen/native/quantized/cpu/QuantUtils.h>
 #include <ATen/native/quantized/library.h>
 #include <ATen/native/quantized/PackedParams.h>
@ -698,135 +697,6 @@ static at::Tensor linear_dynamic_fp16_with_onednn_weight(
  primitive.execute(ideep::stream::default_stream(), args);
  return dim == 2 ? output : output.reshape(output_size);
 }
-
-#if AT_MKLDNN_ACL_ENABLED()
-
-template <bool ReluFused>
-at::Tensor PackedLinearWeightsACL::apply_dynamic_impl(
-    at::Tensor input,
-    bool reduce_range) {
-  // Dynamic: fp32 * int8 -> fp32
-  using at::Tensor;
-
-  TORCH_CHECK(
-      input.dim() >= 2,
-      "The dimension of input tensor should be larger than or equal to 2");
-  TORCH_CHECK(
-      input.scalar_type() == c10::ScalarType::Float,
-      "qlinear_dynamic (ACL): data type of input should be float.");
-
-  auto input_contig = input.contiguous();
-  const int64_t dim = input.dim();
-  auto input_reshaped =
-      dim == 2 ? input : input.reshape({-1, input.size(input.dim() - 1)});
-  auto input_dims = input_reshaped.sizes().vec();
-
-  int64_t m = input_dims[0];
-  auto key = std::make_tuple(
-      m, /* M */
-      ReluFused, /* FUSE_RELU */
-      static_cast<int64_t>(at::get_num_threads()), /* NUM_THREADS */
-      1, /* INPUT_SCALE */
-      0, /* INPUT_OFFSET */
-      1, /* OUTPUT_SCALE */
-      0, /* OUTPUT_OFFSET */
-      true /* SIGNED_INPUT */
-  );
-  auto acl_gemm =
-      get_acl_quant_matmul<at::native::acl_utils::DynamicQuantMatmul>(key);
-
-  if (acl_gemm) {
-    // Find quantization parameters
-    float x_max = 0, x_min = 0;
-
-#ifdef USE_FBGEMM
-    // Use FBGEMM's FindMinMax if available since it's faster
-    fbgemm::FindMinMax(
-        /*m=*/input_contig.data_ptr<float>(),
-        /*min=*/&x_min,
-        /*max=*/&x_max,
-        /*len=*/input.numel());
-#else
-    if (input_contig.numel() > 0) {
-      auto [t_min, t_max] = at::aminmax(input_contig);
-      x_max = t_max.item<float>();
-      x_min = t_min.item<float>();
-    }
-#endif
-
-    auto q_params = quant_utils::ChooseQuantizationParams(
-        /*min=*/x_min,
-        /*max=*/x_max,
-        /*qmin=*/std::numeric_limits<int8_t>::min(),
-        /*qmax=*/std::numeric_limits<int8_t>::max(),
-        /*preserve_sparsity=*/false,
-        /*force_scale_power_of_two=*/false,
-        /*reduce_range=*/reduce_range);
-
-    acl_gemm->src_tensor.allocator()->import_memory(
-        (float*)input_contig.data_ptr());
-
-    acl_gemm->src_q_tensor.info()->set_quantization_info(
-        arm_compute::QuantizationInfo(
-            q_params.scale, q_params.zero_point, true));
-
-    // quantize src tensor: fp32 -> s8
-    acl_gemm->quant.run();
-
-    // allocation for fp32 out tensor
-    auto output = at::empty({m, n_}, input.options().dtype(at::kFloat));
-    if (output.numel() == 0)
-      return output;
-
-    // We set the offset to "-zero_point" for the GEMM, but to "zero_point" for
-    // the quantization layer This is a known inconsistency in ACL.
-    acl_gemm->src_q_tensor.info()->set_quantization_info(
-        arm_compute::QuantizationInfo(
-            q_params.scale, -q_params.zero_point, true));
-
-    acl_gemm->dst_tensor.allocator()->import_memory((float*)output.data_ptr());
-
-    // s8 src, s8 wei -> f32 dst
-    acl_gemm->gemm.run();
-
-    if (acl_gemm->relu.has_value()) {
-      acl_gemm->relu->run();
-    }
-
-    // this will not free memory, it will just tell ACL that we're no longer
-    // using the pointer
-    acl_gemm->src_tensor.allocator()->free();
-    acl_gemm->dst_tensor.allocator()->free();
-
-    auto out_sizes = input.sizes().vec();
-    out_sizes.back() = n_;
-    if (output.sizes().vec() == out_sizes)
-      return output;
-    return output.reshape(out_sizes);
-  }
-
-  // fallback to oneDNN in the unlikely scinario that ACL's validation fails
-  if (ReluFused) {
-    return PackedLinearWeightsOnednn::apply_dynamic_relu(input, reduce_range);
-  } else {
-    return PackedLinearWeightsOnednn::apply_dynamic(input, reduce_range);
-  }
-}
-
-at::Tensor PackedLinearWeightsACL::apply_dynamic(
-    at::Tensor input,
-    bool reduce_range) {
-  return apply_dynamic_impl</*ReluFused=*/false>(
-      std::move(input), reduce_range);
-}
-
-at::Tensor PackedLinearWeightsACL::apply_dynamic_relu(
-    at::Tensor input,
-    bool reduce_range) {
-  return apply_dynamic_impl</*ReluFused=*/true>(std::move(input), reduce_range);
-}
-
-#endif // #if AT_MKLDNN_ACL_ENABLED()
 #endif // #if AT_MKLDNN_ENABLED()

 namespace at::native {
--- a/aten/src/ATen/native/quantized/cpu/qlinear_prepack.cpp
+++ b/aten/src/ATen/native/quantized/cpu/qlinear_prepack.cpp
@ -1,16 +1,15 @@
 #define TORCH_ASSERT_ONLY_METHOD_OPERATORS
-#include <ATen/Context.h>
 #include <ATen/core/Tensor.h>
 #include <ATen/cpp_custom_type_hack.h>
-#include <ATen/native/mkldnn/MKLDNNCommon.h>
-#include <ATen/native/quantized/PackedParams.h>
-#include <ATen/native/quantized/cpu/ACLUtils.h>
-#include <ATen/native/quantized/cpu/OnednnUtils.h>
-#include <ATen/native/quantized/cpu/QnnpackUtils.h>
-#include <ATen/native/quantized/cpu/QuantUtils.h>
+#include <ATen/Context.h>
 #include <ATen/native/quantized/cpu/fbgemm_utils.h>
 #include <ATen/native/quantized/cpu/init_qnnpack.h>
+#include <ATen/native/quantized/cpu/QnnpackUtils.h>
+#include <ATen/native/quantized/cpu/OnednnUtils.h>
+#include <ATen/native/quantized/cpu/QuantUtils.h>
 #include <ATen/native/quantized/library.h>
+#include <ATen/native/quantized/PackedParams.h>
+#include <ATen/native/mkldnn/MKLDNNCommon.h>
 #include <ATen/quantized/Quantizer.h>
 #include <torch/custom_class.h>
 #include <torch/library.h>
@ -280,15 +279,12 @@ c10::intrusive_ptr<LinearPackedParamsBase> PackedLinearWeightsOnednn::prepack(
    packed_bias.init(bias_desc, b.data_ptr());
    onednn_bias = std::optional<ideep::tensor>(packed_bias);
  }
-#if AT_MKLDNN_ACL_ENABLED()
-  if (qtype == c10::kPerTensorAffine) {
-    return c10::make_intrusive<PackedLinearWeightsACL>(PackedLinearWeightsACL{
-        std::move(weight_ptr), onednn_bias, weight, bias});
-  }
-#endif // #if AT_MKLDNN_ACL_ENABLED()
-  auto ret_ptr =
-      c10::make_intrusive<PackedLinearWeightsOnednn>(PackedLinearWeightsOnednn{
-          std::move(weight_ptr), onednn_bias, weight, bias});
+  auto ret_ptr = c10::make_intrusive<PackedLinearWeightsOnednn>(
+      PackedLinearWeightsOnednn{
+        std::move(weight_ptr),
+        onednn_bias,
+        weight,
+        bias});
  return ret_ptr;
 }

--- a/aten/src/ATen/native/transformers/attention.cpp
+++ b/aten/src/ATen/native/transformers/attention.cpp
@ -759,28 +759,6 @@ Tensor scaled_dot_product_attention(
          && !(GradMode::is_enabled() && any_inputs_require_grad)
          && (all_contiguous || mps::is_macos_13_or_newer(mps::MacOSVersion::MACOS_VER_15_0_PLUS))
          && !any_nested) {
-        if (enable_gqa) {
-          int64_t q_heads = query_.size(-3);
-          int64_t k_heads = key.size(-3);
-          int64_t repeat_factor = q_heads / k_heads;
-
-          if (repeat_factor > 1) {
-            TORCH_CHECK(q_heads % k_heads == 0,
-                          "For GQA, the query tensor's head dimension (" + std::to_string(q_heads) +
-                                    ") must be divisible by the key tensor's head dimension (" + std::to_string(k_heads) + ").");
-            auto repeated_key = key.repeat_interleave(repeat_factor, /*dim=*/-3);
-            auto repeated_value = value.repeat_interleave(repeat_factor, /*dim=*/-3);
-            return std::get<0>(at::_scaled_dot_product_attention_math_for_mps(
-              query_,
-              repeated_key,
-              repeated_value,
-              attn_mask,
-              dropout_p,
-              is_causal,
-              std::nullopt, /*dropout_mask*/
-              scale));
-          }
-        }
        return std::get<0>(at::_scaled_dot_product_attention_math_for_mps(
            query_,
            key,
--- a/benchmarks/dynamo/common.py
+++ b/benchmarks/dynamo/common.py
@ -1408,7 +1408,7 @@ class AOTInductorModelCache:
    def load(cls, model, example_inputs):
        import torch._inductor
        import torch.export._trace
-        from torch.export.dynamic_shapes import _combine_args, _tree_map_with_path
+        from torch.export.dynamic_shapes import _tree_map_with_path

        key = weakref.ref(model)
        if key not in cls.cache:
@ -1428,7 +1428,7 @@ class AOTInductorModelCache:
            else:
                _register_dataclass_output_as_pytree(example_outputs)

-            combined_args = _combine_args(model, example_args, example_kwargs)
+            combined_args = tuple(example_args) + tuple(example_kwargs.values())
            dynamic_shapes = _tree_map_with_path(
                _produce_dynamic_shapes_for_export, combined_args
            )
@ -1449,13 +1449,13 @@ class AOTInductorModelCache:


 def export(model, example_inputs):
-    from torch.export.dynamic_shapes import _combine_args, _tree_map_with_path
+    from torch.export.dynamic_shapes import _tree_map_with_path

    example_args, example_kwargs = _normalize_bench_inputs(example_inputs)
    example_outputs = model(*example_args, **example_kwargs)
    _register_dataclass_output_as_pytree(example_outputs)

-    combined_args = _combine_args(model, example_args, example_kwargs)
+    combined_args = tuple(example_args) + tuple(example_kwargs.values())
    dynamic_shapes = _tree_map_with_path(
        _produce_dynamic_shapes_for_export, combined_args
    )
--- a/benchmarks/dynamo/huggingface.py
+++ b/benchmarks/dynamo/huggingface.py
@ -369,10 +369,7 @@ class HuggingfaceRunner(BenchmarkRunner):
        return self._skip["control_flow"]

    def use_larger_multiplier_for_smaller_tensor(self, name):
-        return name in [
-            "ElectraForQuestionAnswering",
-            "MegatronBertForQuestionAnswering",
-        ]
+        return name in ["ElectraForQuestionAnswering"]

    def _get_model_cls_and_config(self, model_name):
        if model_name not in EXTRA_MODELS:
--- a/benchmarks/dynamo/pr_time_benchmarks/expected_results.csv
+++ b/benchmarks/dynamo/pr_time_benchmarks/expected_results.csv
@ -6,7 +6,7 @@ add_loop_eager_dynamic,compile_time_instruction_count,5460000000,0.025



-add_loop_inductor,compile_time_instruction_count,27660000000,0.015
+add_loop_inductor,compile_time_instruction_count,27520000000,0.015



@ -22,11 +22,11 @@ basic_modules_ListOfLinears_eager,compile_time_instruction_count,953800000,0.015



-basic_modules_ListOfLinears_inductor,compile_time_instruction_count,17190000000,0.015
+basic_modules_ListOfLinears_inductor,compile_time_instruction_count,17070000000,0.015



-basic_modules_ListOfLinears_inductor_gpu_force_shape_pad,compile_time_instruction_count,15410000000,0.015
+basic_modules_ListOfLinears_inductor_gpu_force_shape_pad,compile_time_instruction_count,15320000000,0.015



@ -42,24 +42,24 @@ sum_floordiv_regression,compile_time_instruction_count,1026000000,0.015



-symint_sum,compile_time_instruction_count,3030000000,0.015
+symint_sum,compile_time_instruction_count,3013000000,0.015



-aotdispatcher_inference_nosubclass_cpu,compile_time_instruction_count,1989000000,0.015
+aotdispatcher_inference_nosubclass_cpu,compile_time_instruction_count,1964000000,0.015



-aotdispatcher_inference_subclass_cpu,compile_time_instruction_count,5759000000,0.015
+aotdispatcher_inference_subclass_cpu,compile_time_instruction_count,5672000000,0.015



-aotdispatcher_partitioner_cpu,compile_time_instruction_count,7873000000,0.015
+aotdispatcher_partitioner_cpu,compile_time_instruction_count,7752000000,0.015



-aotdispatcher_training_nosubclass_cpu,compile_time_instruction_count,3579000000,0.015
+aotdispatcher_training_nosubclass_cpu,compile_time_instruction_count,3537000000,0.015



-aotdispatcher_training_subclass_cpu,compile_time_instruction_count,9809000000,0.015
+aotdispatcher_training_subclass_cpu,compile_time_instruction_count,9662000000,0.015
--- a/benchmarks/dynamo/timm_models.py
+++ b/benchmarks/dynamo/timm_models.py
@ -127,7 +127,6 @@ REQUIRE_LARGER_MULTIPLIER_FOR_SMALLER_TENSOR = {
    "inception_v3",
    "mobilenetv3_large_100",
    "cspdarknet53",
-    "gluon_inception_v3",
 }


--- a/benchmarks/inductor_backends/cutlass.py
+++ b/benchmarks/inductor_backends/cutlass.py
@ -167,7 +167,7 @@ def get_inputs(

    if op_name == "mm":
        A = torch.randn(M, K, dtype=dtype, device=device)
-        B = torch.randn(N, K, dtype=dtype, device=device).t()
+        B = torch.randn(K, N, dtype=dtype, device=device)
        C = None
        return A, B, C
    else:
--- a/benchmarks/operator_benchmark/benchmark_core.py
+++ b/benchmarks/operator_benchmark/benchmark_core.py
@ -296,7 +296,8 @@ class BenchmarkRunner:
            (key.strip(), value.strip())
            for key, value in map(lambda str: str.split(":"), key_vals)  # noqa: C417
        ]  # ['M: (32, 16)', 'ZPB: 2'] -> [('M', '(32, 16)'), ('ZPB', '2')]
-        out.update(key_vals)
+        for key, value in key_vals:
+            out[key] = value

        return out

--- a/build_variables.bzl
+++ b/build_variables.bzl
@ -859,7 +859,6 @@ libtorch_python_core_sources = [
    "torch/csrc/inductor/aoti_eager/kernel_holder.cpp",
    "torch/csrc/inductor/aoti_eager/kernel_meta_info.cpp",
    "torch/csrc/inductor/resize_storage_bytes.cpp",
-    "torch/csrc/inductor/static_cuda_launcher.cpp",
    "torch/csrc/jit/backends/backend_init.cpp",
    "torch/csrc/jit/python/init.cpp",
    "torch/csrc/jit/passes/onnx.cpp",
--- a/c10/CMakeLists.txt
+++ b/c10/CMakeLists.txt
@ -50,7 +50,7 @@ endif()
        )
 if(NOT BUILD_LIBTORCHLESS)
  add_library(c10 ${C10_SRCS} ${C10_HEADERS})
-  torch_compile_options(c10)
+  target_compile_options_if_supported(c10 "-Wdeprecated")
  if(HAVE_SOVERSION)
    set_target_properties(c10 PROPERTIES
        VERSION ${TORCH_VERSION} SOVERSION ${TORCH_SOVERSION})
--- a/c10/core/Backend.h
+++ b/c10/core/Backend.h
@ -76,7 +76,7 @@ inline Backend dispatchKeyToBackend(DispatchKey t) {
    return Backend::VE;
  } else if (t == DispatchKey::FPGA) {
    return Backend::FPGA;
-  } else if (t == DispatchKey::MAIA || t == DispatchKey::AutogradMAIA) {
+  } else if (t == DispatchKey::MAIA) {
    return Backend::MAIA;
  } else if (t == DispatchKey::XLA || t == DispatchKey::AutogradXLA) {
    return Backend::XLA;
--- a/c10/core/DispatchKey.cpp
+++ b/c10/core/DispatchKey.cpp
@ -32,8 +32,6 @@ const char* toString(BackendComponent t) {
      return "VEBit";
    case BackendComponent::MTIABit:
      return "MTIA";
-    case BackendComponent::MAIABit:
-      return "MAIA";
    case BackendComponent::PrivateUse1Bit:
      return "PrivateUse1Bit";
    case BackendComponent::PrivateUse2Bit:
@ -144,8 +142,6 @@ const char* toString(DispatchKey t) {
      return "AutocastCPU";
    case DispatchKey::AutocastMTIA:
      return "AutocastMTIA";
-    case DispatchKey::AutocastMAIA:
-      return "AutocastMAIA";
    case DispatchKey::AutocastXPU:
      return "AutocastXPU";
    case DispatchKey::AutocastIPU:
@ -303,7 +299,6 @@ c10::DispatchKey parseDispatchKey(const std::string& k) {
      {"Tracer", c10::DispatchKey::Tracer},
      {"AutocastCPU", c10::DispatchKey::AutocastCPU},
      {"AutocastMTIA", c10::DispatchKey::AutocastMTIA},
-      {"AutocastMAIA", c10::DispatchKey::AutocastMAIA},
      {"AutocastXPU", c10::DispatchKey::AutocastXPU},
      {"AutocastIPU", c10::DispatchKey::AutocastIPU},
      {"AutocastHPU", c10::DispatchKey::AutocastHPU},
--- a/Show More
+++ b/Show More