[dynamo] Revert C++-fying of symbolic shape guards

Moving symbolic shape guards to C++ causes compile-time issues. This basically boils down to a tradeoff. For models that have a large number of dynamic shape guards, this flag helps reduce guard latency. But for most models, which have very few dynamic shape guards, the guard latency is small anyway. These models will still see a high compile-time hit because gcc is invoked during compilation. So a good default value seems to be False. We can write a doc to give guidance on reducing guard latency.
2025-11-01 22:14:53 +08:00 · 2025-10-28 11:13:24 -07:00
780 changed files with 8850 additions and 21610 deletions
--- a/.ci/docker/build.sh
+++ b/.ci/docker/build.sh
@ -195,16 +195,13 @@ case "$tag" in
    NINJA_VERSION=1.9.0
    TRITON=yes
    ;;
-  pytorch-linux-jammy-xpu-n-py3 | pytorch-linux-jammy-xpu-n-py3-inductor-benchmarks)
+  pytorch-linux-jammy-xpu-n-py3)
    ANACONDA_PYTHON_VERSION=3.10
    GCC_VERSION=11
    VISION=yes
    XPU_VERSION=2025.2
    NINJA_VERSION=1.9.0
    TRITON=yes
-    if [[ $tag =~ "benchmarks" ]]; then
-      INDUCTOR_BENCHMARKS=yes
-    fi
    ;;
  pytorch-linux-jammy-py3-gcc11-inductor-benchmarks)
    ANACONDA_PYTHON_VERSION=3.10
--- a/.ci/docker/common/install_acl.sh
+++ b/.ci/docker/common/install_acl.sh
@ -3,7 +3,7 @@

 set -eux

-ACL_VERSION=${ACL_VERSION:-"v52.6.0"}
+ACL_VERSION=${ACL_VERSION:-"v25.02"}
 ACL_INSTALL_DIR="/acl"

 # Clone ACL
--- a/.ci/docker/common/install_conda.sh
+++ b/.ci/docker/common/install_conda.sh
@ -49,20 +49,12 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then
    export SYSROOT_DEP="sysroot_linux-64=2.17"
  fi

-# Install correct Python version
-# Also ensure sysroot is using a modern GLIBC to match system compilers
-if [ "$ANACONDA_PYTHON_VERSION" = "3.14" ]; then
-  as_jenkins conda create -n py_$ANACONDA_PYTHON_VERSION -y\
-             python="3.14.0" \
-             ${SYSROOT_DEP} \
-             -c conda-forge
-else
  # Install correct Python version
  # Also ensure sysroot is using a modern GLIBC to match system compilers
  as_jenkins conda create -n py_$ANACONDA_PYTHON_VERSION -y\
             python="$ANACONDA_PYTHON_VERSION" \
             ${SYSROOT_DEP}
-fi
+
  # libstdcxx from conda default channels are too old, we need GLIBCXX_3.4.30
  # which is provided in libstdcxx 12 and up.
  conda_install libstdcxx-ng=12.3.0 --update-deps -c conda-forge
--- a/.ci/docker/common/install_cuda.sh
+++ b/.ci/docker/common/install_cuda.sh
@ -10,7 +10,7 @@ else
  arch_path='sbsa'
 fi

-NVSHMEM_VERSION=3.4.5
+NVSHMEM_VERSION=3.3.24

 function install_cuda {
  version=$1
--- a/.ci/docker/common/install_rocm.sh
+++ b/.ci/docker/common/install_rocm.sh
@ -40,7 +40,11 @@ EOF

    # Default url values
    rocm_baseurl="http://repo.radeon.com/rocm/apt/${ROCM_VERSION}"
+    amdgpu_baseurl="https://repo.radeon.com/amdgpu/${ROCM_VERSION}/ubuntu"
+
+    # Add amdgpu repository
    UBUNTU_VERSION_NAME=`cat /etc/os-release | grep UBUNTU_CODENAME | awk -F= '{print $2}'`
+    echo "deb [arch=amd64] ${amdgpu_baseurl} ${UBUNTU_VERSION_NAME} main" > /etc/apt/sources.list.d/amdgpu.list

    # Add rocm repository
    wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add -
--- a/.ci/docker/common/install_rocm_magma.sh
+++ b/.ci/docker/common/install_rocm_magma.sh
@ -12,8 +12,8 @@ function do_install() {

    rocm_version_nodot=${rocm_version//./}

-    # post merge of https://github.com/icl-utk-edu/magma/pull/65
-    MAGMA_VERSION=c0792ae825fb36872784892ea643dd6f3456bc5f
+    # https://github.com/icl-utk-edu/magma/pull/65
+    MAGMA_VERSION=d6e4117bc88e73f06d26c6c2e14f064e8fc3d1ec
    magma_archive="magma-rocm${rocm_version_nodot}-${MAGMA_VERSION}-1.tar.bz2"

    rocm_dir="/opt/rocm"
--- a/.ci/docker/requirements-ci.txt
+++ b/.ci/docker/requirements-ci.txt
@ -138,12 +138,10 @@ numba==0.60.0 ; python_version == "3.12" and platform_machine != "s390x"
 #test_binary_ufuncs.py
 numpy==1.22.4; python_version == "3.10"
 numpy==1.26.2; python_version == "3.11" or python_version == "3.12"
-numpy==2.1.2; python_version >= "3.13" and python_version < "3.14"
-numpy==2.3.4; python_version >= "3.14"
+numpy==2.1.2; python_version >= "3.13"

 pandas==2.0.3; python_version < "3.13"
-pandas==2.2.3; python_version >= "3.13" and python_version < "3.14"
-pandas==2.3.3; python_version >= "3.14"
+pandas==2.2.3; python_version >= "3.13"

 #onnxruntime
 #Description: scoring engine for Open Neural Network Exchange (ONNX) models
@ -155,8 +153,7 @@ opt-einsum==3.3
 #Pinned versions: 3.3
 #test that import: test_linalg.py

-optree==0.13.0 ; python_version < "3.14"
-optree==0.17.0 ; python_version >= "3.14"
+optree==0.13.0
 #Description: A library for tree manipulation
 #Pinned versions: 0.13.0
 #test that import: test_vmap.py, test_aotdispatch.py, test_dynamic_shapes.py,
@ -255,8 +252,7 @@ scikit-image==0.22.0
 #test that import:

 scipy==1.10.1 ; python_version <= "3.11"
-scipy==1.14.1 ; python_version > "3.11" and python_version < "3.14"
-scipy==1.16.2 ; python_version >= "3.14"
+scipy==1.14.1 ; python_version >= "3.12"
 # Pin SciPy because of failing distribution tests (see #60347)
 #Description: scientific python
 #Pinned versions: 1.10.1
@ -328,8 +324,7 @@ pywavelets==1.7.0 ; python_version >= "3.12"
 #Pinned versions: 1.4.1
 #test that import:

-lxml==5.3.0 ; python_version < "3.14"
-lxml==6.0.2 ; python_version >= "3.14"
+lxml==5.3.0
 #Description: This is a requirement of unittest-xml-reporting

 PyGithub==2.3.0
@ -339,9 +334,7 @@ sympy==1.13.3
 #Pinned versions:
 #test that import:

-onnx==1.19.1 ; python_version < "3.14"
-# Unpin once Python 3.14 is supported. See  onnxruntime issue 26309.
-onnx==1.18.0 ; python_version == "3.14"
+onnx==1.19.1
 #Description: Required by onnx tests, and mypy and test_public_bindings.py when checking torch.onnx._internal
 #Pinned versions:
 #test that import:
@ -366,7 +359,7 @@ pwlf==2.2.1
 #test that import: test_sac_estimator.py

 # To build PyTorch itself
-pyyaml==6.0.3
+pyyaml==6.0.2
 pyzstd
 setuptools==78.1.1
 packaging==23.1
--- a/.ci/docker/ubuntu-xpu/Dockerfile
+++ b/.ci/docker/ubuntu-xpu/Dockerfile
@ -54,15 +54,12 @@ ENV OPENSSL_DIR /opt/openssl
 RUN rm install_openssl.sh

 ARG INDUCTOR_BENCHMARKS
-ARG ANACONDA_PYTHON_VERSION
-ENV ANACONDA_PYTHON_VERSION=$ANACONDA_PYTHON_VERSION
 COPY ./common/install_inductor_benchmark_deps.sh install_inductor_benchmark_deps.sh
 COPY ./common/common_utils.sh common_utils.sh
 COPY ci_commit_pins/huggingface-requirements.txt huggingface-requirements.txt
 COPY ci_commit_pins/timm.txt timm.txt
-COPY ci_commit_pins/torchbench.txt torchbench.txt
 RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi
-RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface-requirements.txt torchbench.txt
+RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface-requirements.txt

 # Install XPU Dependencies
 ARG XPU_VERSION
--- a/.ci/docker/ubuntu/Dockerfile
+++ b/.ci/docker/ubuntu/Dockerfile
@ -100,8 +100,6 @@ COPY ./common/common_utils.sh common_utils.sh
 COPY ci_commit_pins/huggingface-requirements.txt huggingface-requirements.txt
 COPY ci_commit_pins/timm.txt timm.txt
 COPY ci_commit_pins/torchbench.txt torchbench.txt
-# Only build aoti cpp tests when INDUCTOR_BENCHMARKS is set to True
-ENV BUILD_AOT_INDUCTOR_TEST ${INDUCTOR_BENCHMARKS}
 RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi
 RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface-requirements.txt torchbench.txt

--- a/.ci/lumen_cli/pyproject.toml
+++ b/.ci/lumen_cli/pyproject.toml
@ -6,7 +6,7 @@ dependencies = [
    "GitPython==3.1.45",
    "docker==7.1.0",
    "pytest==7.3.2",
-    "uv==0.9.6"
+    "uv==0.9.5"
 ]

 [tool.setuptools]
--- a/.ci/magma-rocm/Makefile
+++ b/.ci/magma-rocm/Makefile
@ -1,7 +1,7 @@
 SHELL=/usr/bin/env bash

 DOCKER_CMD ?= docker
-DESIRED_ROCM ?= 7.1
+DESIRED_ROCM ?= 7.0
 DESIRED_ROCM_SHORT = $(subst .,,$(DESIRED_ROCM))
 PACKAGE_NAME = magma-rocm
 # inherit this from underlying docker image, do not pass this env var to docker
@ -16,7 +16,6 @@ DOCKER_RUN = set -eou pipefail; ${DOCKER_CMD} run --rm -i \
 	magma-rocm/build_magma.sh

 .PHONY: all
-all: magma-rocm71
 all: magma-rocm70
 all: magma-rocm64

@ -25,11 +24,6 @@ clean:
 	$(RM) -r magma-*
 	$(RM) -r output

-.PHONY: magma-rocm71
-magma-rocm71: DESIRED_ROCM := 7.1
-magma-rocm71:
-	$(DOCKER_RUN)
-
 .PHONY: magma-rocm70
 magma-rocm70: DESIRED_ROCM := 7.0
 magma-rocm70:
--- a/.ci/magma-rocm/build_magma.sh
+++ b/.ci/magma-rocm/build_magma.sh
@ -6,8 +6,8 @@ set -eou pipefail
 # The script expects DESIRED_CUDA and PACKAGE_NAME to be set
 ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"

-# post merge of https://github.com/icl-utk-edu/magma/pull/65
-MAGMA_VERSION=c0792ae825fb36872784892ea643dd6f3456bc5f
+# https://github.com/icl-utk-edu/magma/pull/65
+MAGMA_VERSION=d6e4117bc88e73f06d26c6c2e14f064e8fc3d1ec

 # Folders for the build
 PACKAGE_FILES=${ROOT_DIR}/magma-rocm/package_files # metadata
@ -20,7 +20,7 @@ mkdir -p ${PACKAGE_DIR} ${PACKAGE_OUTPUT}/linux-64 ${PACKAGE_BUILD} ${PACKAGE_RE

 # Fetch magma sources and verify checksum
 pushd ${PACKAGE_DIR}
-git clone https://github.com/icl-utk-edu/magma
+git clone https://github.com/jeffdaily/magma
 pushd magma
 git checkout ${MAGMA_VERSION}
 popd
--- a/.ci/pytorch/build.sh
+++ b/.ci/pytorch/build.sh
@ -426,7 +426,7 @@ fi
 if [[ "$BUILD_ENVIRONMENT" != *libtorch* && "$BUILD_ENVIRONMENT" != *bazel* ]]; then
  # export test times so that potential sharded tests that'll branch off this build will use consistent data
  # don't do this for libtorch as libtorch is C++ only and thus won't have python tests run on its build
-  PYTHONPATH=. python tools/stats/export_test_times.py
+  python tools/stats/export_test_times.py
 fi
 # don't do this for bazel or s390x or riscv64 as they don't use sccache
 if [[ "$BUILD_ENVIRONMENT" != *s390x* && "$BUILD_ENVIRONMENT" != *riscv64* && "$BUILD_ENVIRONMENT" != *-bazel-* ]]; then
--- a/.ci/pytorch/test.sh
+++ b/.ci/pytorch/test.sh
@ -460,18 +460,28 @@ test_inductor_shard() {
    --verbose
 }

-test_inductor_aoti_cpp() {
+test_inductor_aoti() {
+  # docker build uses bdist_wheel which does not work with test_aot_inductor
+  # TODO: need a faster way to build
  if [[ "$BUILD_ENVIRONMENT" == *rocm* ]]; then
    # We need to hipify before building again
    python3 tools/amd_build/build_amd.py
  fi
  if [[ "$BUILD_ENVIRONMENT" == *sm86* ]]; then
+    BUILD_COMMAND=(TORCH_CUDA_ARCH_LIST=8.6 USE_FLASH_ATTENTION=OFF python -m pip install --no-build-isolation -v -e .)
    # TODO: Replace me completely, as one should not use conda libstdc++, nor need special path to TORCH_LIB
    TEST_ENVS=(CPP_TESTS_DIR="${BUILD_BIN_DIR}" LD_LIBRARY_PATH="/opt/conda/envs/py_3.10/lib:${TORCH_LIB_DIR}:${LD_LIBRARY_PATH}")
  else
+    BUILD_COMMAND=(python -m pip install --no-build-isolation -v -e .)
    TEST_ENVS=(CPP_TESTS_DIR="${BUILD_BIN_DIR}" LD_LIBRARY_PATH="${TORCH_LIB_DIR}")
  fi

+  # aoti cmake custom command requires `torch` to be installed
+  # initialize the cmake build cache and install torch
+  /usr/bin/env "${BUILD_COMMAND[@]}"
+  # rebuild with the build cache with `BUILD_AOT_INDUCTOR_TEST` enabled
+  /usr/bin/env CMAKE_FRESH=1 BUILD_AOT_INDUCTOR_TEST=1 "${BUILD_COMMAND[@]}"
+
  /usr/bin/env "${TEST_ENVS[@]}" python test/run_test.py --cpp --verbose -i cpp/test_aoti_abi_check cpp/test_aoti_inference cpp/test_vec_half_AVX2 -dist=loadfile
 }

@ -572,8 +582,6 @@ fi

 if [[ "${TEST_CONFIG}" == *cpu* ]]; then
  DYNAMO_BENCHMARK_FLAGS+=(--device cpu)
-elif [[ "${TEST_CONFIG}" == *xpu* ]]; then
-  DYNAMO_BENCHMARK_FLAGS+=(--device xpu)
 else
  DYNAMO_BENCHMARK_FLAGS+=(--device cuda)
 fi
@ -667,8 +675,6 @@ test_perf_for_dashboard() {
    device=cuda_b200
  elif [[ "${TEST_CONFIG}" == *rocm* ]]; then
    device=rocm
-  elif [[ "${TEST_CONFIG}" == *xpu* ]]; then
-    device=xpu
  fi

  for mode in "${modes[@]}"; do
@ -1761,7 +1767,7 @@ elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then
  else
    # Do this after checkout_install_torchbench to ensure we clobber any
    # nightlies that torchbench may pull in
-    if [[ "${TEST_CONFIG}" != *cpu* && "${TEST_CONFIG}" != *xpu* ]]; then
+    if [[ "${TEST_CONFIG}" != *cpu* ]]; then
      install_torchrec_and_fbgemm
    fi
    PYTHONPATH=/torchbench test_dynamo_benchmark torchbench "$id"
@ -1770,7 +1776,7 @@ elif [[ "${TEST_CONFIG}" == *inductor_cpp_wrapper* ]]; then
  install_torchvision
  PYTHONPATH=/torchbench test_inductor_cpp_wrapper_shard "$SHARD_NUMBER"
  if [[ "$SHARD_NUMBER" -eq "1" ]]; then
-    test_inductor_aoti_cpp
+    test_inductor_aoti
  fi
 elif [[ "${TEST_CONFIG}" == *inductor* ]]; then
  install_torchvision
--- a/.ci/pytorch/windows/internal/install_python.bat
+++ b/.ci/pytorch/windows/internal/install_python.bat
@ -7,9 +7,12 @@ if "%DESIRED_PYTHON%" == "3.13t" (
    set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/3.13.0/python-3.13.0-amd64.exe"
    set ADDITIONAL_OPTIONS="Include_freethreaded=1"
    set PYTHON_EXEC="python3.13t"
+) else if "%DESIRED_PYTHON%"=="3.14" (
+    echo Python version is set to 3.14 or 3.14t
+    set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/3.14.0/python-3.14.0rc1-amd64.exe"
 ) else if "%DESIRED_PYTHON%"=="3.14t" (
    echo Python version is set to 3.14 or 3.14t
-    set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/3.14.0/python-3.14.0-amd64.exe"
+    set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/3.14.0/python-3.14.0rc1-amd64.exe"
    set ADDITIONAL_OPTIONS="Include_freethreaded=1"
    set PYTHON_EXEC="python3.14t"
 ) else (
--- a/.github/actions/diskspace-cleanup/action.yml
+++ b/.github/actions/diskspace-cleanup/action.yml
@ -27,9 +27,7 @@ runs:
            docker system prune -af
            diskspace_new=$(df -H --output=pcent ${docker_root_dir} | sed -n 2p | sed 's/%//' | sed 's/ //')
            if [[ "$diskspace_new" -gt "$diskspace_cutoff" ]] ; then
-                diskspace_cutoff_int=$((diskspace_cutoff + 0))
-                difference=$((100 - diskspace_cutoff_int))
-                echo "Error: Available diskspace is less than $difference percent. Not enough diskspace."
+                echo "Error: Available diskspace is less than $diskspace_cutoff percent. Not enough diskspace."
                echo "$msg"
                exit 1
            else
--- a/.github/ci_commit_pins/audio.txt
+++ b/.github/ci_commit_pins/audio.txt
@ -1 +1 @@
-3b0e7a6f192ca2715e7e6cbe5db007aea7165fe2
+69bbe7363897764f9e758d851cd0340147d27f94
--- a/.github/ci_commit_pins/vision.txt
+++ b/.github/ci_commit_pins/vision.txt
@ -1 +1 @@
-218d2ab791d437309f91e0486eb9fa7f00badc17
+1752fe6809b74921644866275ab80244b96e80bc
--- a/.github/merge_rules.yaml
+++ b/.github/merge_rules.yaml
@ -540,26 +540,6 @@
  - Lint
  - pull

- name: PrivateUse1
-  patterns:
-  - torch/accelerator/**
-  - torch/utils/backend_registration.py
-  - torch/csrc/acc/**
-  - torch/csrc/DeviceAccelerator.*
-  - torch/csrc/profiler/standalone/privateuse1_observer.*
-  - aten/src/ATen/DeviceAccelerator.*
-  - aten/src/ATen/core/GeneratorForPrivateuseone.*
-  - aten/src/ATen/detail/PrivateUse1HooksInterface.*
-  - docs/source/accelerator/**
-  - test/cpp_extensions/open_registration_extension/torch_openreg/**
-  approved_by:
-  - albanD
-  - fffrog
-  mandatory_checks_name:
-  - EasyCLA
-  - Lint
-  - pull
-
 - name: superuser
  patterns:
  - '*'
--- a/.github/pytorch-probot.yml
+++ b/.github/pytorch-probot.yml
@ -19,7 +19,6 @@ ciflow_push_tags:
 - ciflow/inductor-perf-test-nightly-rocm-mi300
 - ciflow/inductor-perf-test-nightly-rocm-mi355
 - ciflow/inductor-perf-test-nightly-x86-zen
- ciflow/inductor-perf-test-nightly-xpu
 - ciflow/inductor-periodic
 - ciflow/inductor-rocm
 - ciflow/linux-aarch64
@ -27,7 +26,6 @@ ciflow_push_tags:
 - ciflow/nightly
 - ciflow/op-benchmark
 - ciflow/periodic
- ciflow/periodic-rocm-mi200
 - ciflow/periodic-rocm-mi300
 - ciflow/pull
 - ciflow/quantization-periodic
--- a/.github/scripts/generate_binary_build_matrix.py
+++ b/.github/scripts/generate_binary_build_matrix.py
@ -11,17 +11,11 @@ architectures:
    * Latest XPU
 """

-import json
 import os
-import re
-from pathlib import Path
 from typing import Optional


-SCRIPT_DIR = Path(__file__).absolute().parent
-REPO_ROOT = SCRIPT_DIR.parent.parent
-
-
+# NOTE: Please also update the CUDA sources in `PIP_SOURCES` in tools/nightly.py when changing this
 CUDA_ARCHES = ["12.6", "12.8", "12.9", "13.0"]
 CUDA_STABLE = "12.8"
 CUDA_ARCHES_FULL_VERSION = {
@ -37,7 +31,8 @@ CUDA_ARCHES_CUDNN_VERSION = {
    "13.0": "9",
 }

-ROCM_ARCHES = ["7.0", "7.1"]
+# NOTE: Please also update the ROCm sources in `PIP_SOURCES` in tools/nightly.py when changing this
+ROCM_ARCHES = ["6.4", "7.0"]

 XPU_ARCHES = ["xpu"]

@ -61,7 +56,7 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
        "nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | "
        "nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | "
        "nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | "
-        "nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | "
+        "nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | "
        "nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | "
        "nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | "
        "nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'"
@ -78,7 +73,7 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
        "nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | "
        "nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | "
        "nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | "
-        "nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | "
+        "nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | "
        "nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | "
        "nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | "
        "nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'"
@ -95,7 +90,7 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
        "nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | "
        "nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | "
        "nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | "
-        "nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | "
+        "nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | "
        "nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | "
        "nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | "
        "nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'"
@ -112,7 +107,7 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
        "nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | "
        "nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | "
        "nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | "
-        "nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | "
+        "nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | "
        "nvidia-nvtx==13.0.85; platform_system == 'Linux' | "
        "nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | "
        "nvidia-cufile==1.15.1.6; platform_system == 'Linux'"
@ -142,48 +137,9 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
 }


-# Used by tools/nightly.py
-PYTORCH_NIGHTLY_PIP_INDEX_URL = "https://download.pytorch.org/whl/nightly"
-NIGHTLY_SOURCE_MATRIX = {
-    "cpu": dict(
-        name="cpu",
-        index_url=f"{PYTORCH_NIGHTLY_PIP_INDEX_URL}/cpu",
-        supported_platforms=["Linux", "macOS", "Windows"],
-        accelerator="cpu",
-    )
-}
-CUDA_NIGHTLY_SOURCE_MATRIX = {
-    f"cuda-{major}.{minor}": dict(
-        name=f"cuda-{major}.{minor}",
-        index_url=f"{PYTORCH_NIGHTLY_PIP_INDEX_URL}/cu{major}{minor}",
-        supported_platforms=["Linux", "Windows"],
-        accelerator="cuda",
-    )
-    for major, minor in (map(int, version.split(".")) for version in CUDA_ARCHES)
-}
-ROCM_NIGHTLY_SOURCE_MATRIX = {
-    f"rocm-{major}.{minor}": dict(
-        name=f"rocm-{major}.{minor}",
-        index_url=f"{PYTORCH_NIGHTLY_PIP_INDEX_URL}/rocm{major}.{minor}",
-        supported_platforms=["Linux"],
-        accelerator="rocm",
-    )
-    for major, minor in (map(int, version.split(".")) for version in ROCM_ARCHES)
-}
-XPU_NIGHTLY_SOURCE_MATRIX = {
-    "xpu": dict(
-        name="xpu",
-        index_url=f"{PYTORCH_NIGHTLY_PIP_INDEX_URL}/xpu",
-        supported_platforms=["Linux"],
-        accelerator="xpu",
-    )
-}
-NIGHTLY_SOURCE_MATRIX.update(CUDA_NIGHTLY_SOURCE_MATRIX)
-NIGHTLY_SOURCE_MATRIX.update(ROCM_NIGHTLY_SOURCE_MATRIX)
-NIGHTLY_SOURCE_MATRIX.update(XPU_NIGHTLY_SOURCE_MATRIX)
-
-
 def get_nccl_wheel_version(arch_version: str) -> str:
+    import re
+
    requirements = map(
        str.strip, re.split("[;|]", PYTORCH_EXTRA_INSTALL_REQUIREMENTS[arch_version])
    )
@ -191,14 +147,17 @@ def get_nccl_wheel_version(arch_version: str) -> str:


 def read_nccl_pin(arch_version: str) -> str:
-    nccl_pin_path = (
-        REPO_ROOT
-        / ".ci"
-        / "docker"
-        / "ci_commit_pins"
-        / f"nccl-cu{arch_version[:2]}.txt"
+    from pathlib import Path
+
+    nccl_pin_path = os.path.join(
+        Path(__file__).absolute().parents[2],
+        ".ci",
+        "docker",
+        "ci_commit_pins",
+        f"nccl-cu{arch_version[:2]}.txt",
    )
-    return nccl_pin_path.read_text().strip()
+    with open(nccl_pin_path) as f:
+        return f.read().strip()


 def validate_nccl_dep_consistency(arch_version: str) -> None:
@ -206,8 +165,7 @@ def validate_nccl_dep_consistency(arch_version: str) -> None:
    wheel_ver = get_nccl_wheel_version(arch_version)
    if not nccl_release_tag.startswith(f"v{wheel_ver}"):
        raise RuntimeError(
-            f"{arch_version} NCCL release tag version {nccl_release_tag} "
-            f"does not correspond to wheel version {wheel_ver}"
+            f"{arch_version} NCCL release tag version {nccl_release_tag} does not correspond to wheel version {wheel_ver}"
        )


@ -454,14 +412,7 @@ def generate_wheels_matrix(
    return ret


-arch_version = ""
-for arch_version in CUDA_ARCHES:
-    validate_nccl_dep_consistency(arch_version)
-del arch_version
-
-
-if __name__ == "__main__":
-    # Used by tools/nightly.py
-    (SCRIPT_DIR / "nightly_source_matrix.json").write_text(
-        json.dumps(NIGHTLY_SOURCE_MATRIX, indent=4) + "\n"
-    )
+validate_nccl_dep_consistency("13.0")
+validate_nccl_dep_consistency("12.9")
+validate_nccl_dep_consistency("12.8")
+validate_nccl_dep_consistency("12.6")
--- a/.github/workflows/_xpu-test.yml
+++ b/.github/workflows/_xpu-test.yml
@ -38,10 +38,6 @@ on:
        default: ""
        description: |
          List of tests to include (empty string implies default list)
-      dashboard-tag:
-        required: false
-        type: string
-        default: ""
      disable-monitor:
        description: |
          [Experimental] Disable utilization monitoring for tests.
@ -62,11 +58,6 @@ on:
        required: false
        type: number
        default: 1
-    secrets:
-      HUGGING_FACE_HUB_TOKEN:
-        required: false
-        description: |
-          HF Auth token to avoid rate limits when downloading models or datasets from hub
 permissions:
  id-token: write
  contents: read
@ -205,8 +196,6 @@ jobs:
          PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0' }}
          PYTORCH_TEST_RERUN_DISABLED_TESTS: ${{ matrix.rerun_disabled_tests && '1' || '0' }}
          TESTS_TO_INCLUDE: ${{ inputs.tests-to-include }}
-          DASHBOARD_TAG: ${{ inputs.dashboard-tag }}
-          HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
        timeout-minutes: ${{ fromJson(steps.test-timeout.outputs.timeout) }}
        run: |
          # Fetch aws credential from IMDs
@ -257,8 +246,6 @@ jobs:
            -e PYTORCH_TEST_RERUN_DISABLED_TESTS \
            -e TESTS_TO_INCLUDE \
            -e ZE_AFFINITY_MASK \
-            -e HUGGING_FACE_HUB_TOKEN \
-            -e DASHBOARD_TAG \
            --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
            --ulimit stack=10485760:83886080 \
            --ulimit core=0 \
--- a/.github/workflows/build-almalinux-images.yml
+++ b/.github/workflows/build-almalinux-images.yml
@ -36,7 +36,7 @@ jobs:
    runs-on: linux.9xlarge.ephemeral
    strategy:
      matrix:
-        tag: ["cuda12.6", "cuda12.8", "cuda12.9", "cuda13.0", "rocm7.0", "rocm7.1", "cpu"]
+        tag: ["cuda12.6", "cuda12.8", "cuda12.9", "cuda13.0", "rocm6.4", "rocm7.0", "cpu"]
    steps:
      - name: Build docker image
        uses: pytorch/pytorch/.github/actions/binary-docker-build@main
--- a/.github/workflows/build-libtorch-images.yml
+++ b/.github/workflows/build-libtorch-images.yml
@ -52,8 +52,8 @@ jobs:
          { tag: "cuda12.9" },
          { tag: "cuda12.8" },
          { tag: "cuda12.6" },
+          { tag: "rocm6.4"  },
          { tag: "rocm7.0"  },
-          { tag: "rocm7.1"  },
          { tag: "cpu"      },
        ]
    steps:
--- a/.github/workflows/build-magma-rocm-linux.yml
+++ b/.github/workflows/build-magma-rocm-linux.yml
@ -34,7 +34,7 @@ jobs:
      id-token: write
    strategy:
      matrix:
-        rocm_version: ["71", "70"]
+        rocm_version: ["70", "64"]
    steps:
      - name: Checkout PyTorch
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
--- a/.github/workflows/build-manywheel-images.yml
+++ b/.github/workflows/build-manywheel-images.yml
@ -54,8 +54,8 @@ jobs:
          { name: "manylinuxaarch64-builder",       tag: "cuda12.9",          runner: "linux.arm64.2xlarge.ephemeral" },
          { name: "manylinuxaarch64-builder",       tag: "cuda12.8",          runner: "linux.arm64.2xlarge.ephemeral" },
          { name: "manylinuxaarch64-builder",       tag: "cuda12.6",          runner: "linux.arm64.2xlarge.ephemeral" },
+          { name: "manylinux2_28-builder",          tag: "rocm6.4",           runner: "linux.9xlarge.ephemeral" },
          { name: "manylinux2_28-builder",          tag: "rocm7.0",           runner: "linux.9xlarge.ephemeral" },
-          { name: "manylinux2_28-builder",          tag: "rocm7.1",           runner: "linux.9xlarge.ephemeral" },
          { name: "manylinux2_28-builder",          tag: "cpu",               runner: "linux.9xlarge.ephemeral" },
          { name: "manylinux2_28_aarch64-builder",  tag: "cpu-aarch64",       runner: "linux.arm64.2xlarge.ephemeral" },
          { name: "manylinux2_28-builder",          tag: "xpu",               runner: "linux.9xlarge.ephemeral" },
--- a/.github/workflows/build-triton-wheel.yml
+++ b/.github/workflows/build-triton-wheel.yml
@ -55,7 +55,7 @@ jobs:
        docker-image: ["pytorch/manylinux2_28-builder:cpu"]
        include:
          - device: "rocm"
-            rocm_version: "7.1"
+            rocm_version: "7.0"
            runs_on: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge"
          - device: "cuda"
            rocm_version: ""
--- a/.github/workflows/docker-builds.yml
+++ b/.github/workflows/docker-builds.yml
@ -57,7 +57,6 @@ jobs:
          pytorch-linux-jammy-cuda12.4-cudnn9-py3-gcc11,
          pytorch-linux-jammy-py3.10-clang12,
          pytorch-linux-jammy-py3.13-clang12,
-          pytorch-linux-jammy-py3.14-clang12,
          pytorch-linux-jammy-rocm-n-py3,
          pytorch-linux-noble-rocm-n-py3,
          pytorch-linux-jammy-rocm-n-py3-benchmarks,
@ -67,7 +66,6 @@ jobs:
          pytorch-linux-jammy-py3.12-halide,
          pytorch-linux-jammy-xpu-n-1-py3,
          pytorch-linux-jammy-xpu-n-py3,
-          pytorch-linux-jammy-xpu-n-py3-inductor-benchmarks,
          pytorch-linux-jammy-py3-clang18-asan,
          pytorch-linux-jammy-py3-clang12-onnx,
          pytorch-linux-jammy-linter,
--- a/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml
+++ b/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml
@ -132,7 +132,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_10-cuda-aarch64-12_6
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -178,7 +178,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_10-cuda-aarch64-12_8
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -224,7 +224,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_10-cuda-aarch64-12_9
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -270,7 +270,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_10-cuda-aarch64-13_0
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -381,7 +381,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_11-cuda-aarch64-12_6
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -427,7 +427,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_11-cuda-aarch64-12_8
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -473,7 +473,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_11-cuda-aarch64-12_9
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -519,7 +519,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_11-cuda-aarch64-13_0
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -630,7 +630,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_12-cuda-aarch64-12_6
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -676,7 +676,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_12-cuda-aarch64-12_8
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -722,7 +722,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_12-cuda-aarch64-12_9
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -768,7 +768,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_12-cuda-aarch64-13_0
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -879,7 +879,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_13-cuda-aarch64-12_6
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -925,7 +925,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_13-cuda-aarch64-12_8
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -971,7 +971,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_13-cuda-aarch64-12_9
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1017,7 +1017,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_13-cuda-aarch64-13_0
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1128,7 +1128,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_13t-cuda-aarch64-12_6
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1174,7 +1174,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_13t-cuda-aarch64-12_8
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1220,7 +1220,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_13t-cuda-aarch64-12_9
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1266,7 +1266,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_13t-cuda-aarch64-13_0
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1377,7 +1377,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_14-cuda-aarch64-12_6
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1423,7 +1423,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_14-cuda-aarch64-12_8
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1469,7 +1469,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_14-cuda-aarch64-12_9
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1515,7 +1515,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_14-cuda-aarch64-13_0
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1626,7 +1626,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_14t-cuda-aarch64-12_6
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1672,7 +1672,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_14t-cuda-aarch64-12_8
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1718,7 +1718,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_14t-cuda-aarch64-12_9
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1764,7 +1764,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_14t-cuda-aarch64-13_0
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
--- a/.github/workflows/generated-linux-binary-libtorch-nightly.yml
+++ b/.github/workflows/generated-linux-binary-libtorch-nightly.yml
@ -384,6 +384,124 @@ jobs:
      github-token: ${{ secrets.GITHUB_TOKEN }}
    uses: ./.github/workflows/_binary-upload.yml

+  libtorch-rocm6_4-shared-with-deps-release-build:
+    if: ${{ github.repository_owner == 'pytorch' }}
+    uses: ./.github/workflows/_binary-build-linux.yml
+    needs: get-label-type
+    with:
+      PYTORCH_ROOT: /pytorch
+      PACKAGE_TYPE: libtorch
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: rocm6.4
+      GPU_ARCH_VERSION: "6.4"
+      GPU_ARCH_TYPE: rocm
+      DOCKER_IMAGE: libtorch-cxx11-builder
+      DOCKER_IMAGE_TAG_PREFIX: rocm6.4
+      LIBTORCH_CONFIG: release
+      LIBTORCH_VARIANT: shared-with-deps
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      timeout-minutes: 300
+      build_name: libtorch-rocm6_4-shared-with-deps-release
+      build_environment: linux-binary-libtorch
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+  libtorch-rocm6_4-shared-with-deps-release-test:  # Testing
+    if: ${{ github.repository_owner == 'pytorch' }}
+    needs:
+      - libtorch-rocm6_4-shared-with-deps-release-build
+      - get-label-type
+    runs-on: linux.rocm.gpu.mi250
+    timeout-minutes: 240
+    env:
+      PYTORCH_ROOT: /pytorch
+      PACKAGE_TYPE: libtorch
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: rocm6.4
+      GPU_ARCH_VERSION: "6.4"
+      GPU_ARCH_TYPE: rocm
+      SKIP_ALL_TESTS: 1
+      DOCKER_IMAGE: libtorch-cxx11-builder
+      DOCKER_IMAGE_TAG_PREFIX: rocm6.4
+      LIBTORCH_CONFIG: release
+      LIBTORCH_VARIANT: shared-with-deps
+    permissions:
+      id-token: write
+      contents: read
+    steps:
+      - name: Setup ROCm
+        uses: ./.github/actions/setup-rocm
+      - uses: actions/download-artifact@v4.1.7
+        name: Download Build Artifacts
+        with:
+          name: libtorch-rocm6_4-shared-with-deps-release
+          path: "${{ runner.temp }}/artifacts/"
+      - name: Checkout PyTorch
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+          submodules: recursive
+          path: pytorch
+          show-progress: false
+      - name: Clean PyTorch checkout
+        run: |
+          # Remove any artifacts from the previous checkouts
+          git clean -fxd
+        working-directory: pytorch
+      - name: ROCm set GPU_FLAG
+        run: |
+          echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}"
+      - name: configure aws credentials
+        id: aws_creds
+        if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') }}
+        uses: aws-actions/configure-aws-credentials@v4
+        with:
+          role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
+          aws-region: us-east-1
+          role-duration-seconds: 18000
+      - name: Calculate docker image
+        id: calculate-docker-image
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        with:
+          docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }}
+          docker-image-name: libtorch-cxx11-builder
+          custom-tag-prefix: rocm6.4
+          docker-build-dir: .ci/docker
+          working-directory: pytorch
+      - name: Pull Docker image
+        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
+        with:
+          docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
+      - name: Test Pytorch binary
+        uses: ./pytorch/.github/actions/test-pytorch-binary
+        env:
+          DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
+      - name: Teardown ROCm
+        uses: ./.github/actions/teardown-rocm
+  libtorch-rocm6_4-shared-with-deps-release-upload:  # Uploading
+    if: ${{ github.repository_owner == 'pytorch' }}
+    permissions:
+      id-token: write
+      contents: read
+    needs: libtorch-rocm6_4-shared-with-deps-release-test
+    with:
+      PYTORCH_ROOT: /pytorch
+      PACKAGE_TYPE: libtorch
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: rocm6.4
+      GPU_ARCH_VERSION: "6.4"
+      GPU_ARCH_TYPE: rocm
+      DOCKER_IMAGE: libtorch-cxx11-builder
+      DOCKER_IMAGE_TAG_PREFIX: rocm6.4
+      LIBTORCH_CONFIG: release
+      LIBTORCH_VARIANT: shared-with-deps
+      build_name: libtorch-rocm6_4-shared-with-deps-release
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+    uses: ./.github/workflows/_binary-upload.yml
+
  libtorch-rocm7_0-shared-with-deps-release-build:
    if: ${{ github.repository_owner == 'pytorch' }}
    uses: ./.github/workflows/_binary-build-linux.yml
@ -501,121 +619,3 @@ jobs:
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
    uses: ./.github/workflows/_binary-upload.yml
-
-  libtorch-rocm7_1-shared-with-deps-release-build:
-    if: ${{ github.repository_owner == 'pytorch' }}
-    uses: ./.github/workflows/_binary-build-linux.yml
-    needs: get-label-type
-    with:
-      PYTORCH_ROOT: /pytorch
-      PACKAGE_TYPE: libtorch
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm7.1
-      GPU_ARCH_VERSION: "7.1"
-      GPU_ARCH_TYPE: rocm
-      DOCKER_IMAGE: libtorch-cxx11-builder
-      DOCKER_IMAGE_TAG_PREFIX: rocm7.1
-      LIBTORCH_CONFIG: release
-      LIBTORCH_VARIANT: shared-with-deps
-      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      timeout-minutes: 300
-      build_name: libtorch-rocm7_1-shared-with-deps-release
-      build_environment: linux-binary-libtorch
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-  libtorch-rocm7_1-shared-with-deps-release-test:  # Testing
-    if: ${{ github.repository_owner == 'pytorch' }}
-    needs:
-      - libtorch-rocm7_1-shared-with-deps-release-build
-      - get-label-type
-    runs-on: linux.rocm.gpu.mi250
-    timeout-minutes: 240
-    env:
-      PYTORCH_ROOT: /pytorch
-      PACKAGE_TYPE: libtorch
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm7.1
-      GPU_ARCH_VERSION: "7.1"
-      GPU_ARCH_TYPE: rocm
-      SKIP_ALL_TESTS: 1
-      DOCKER_IMAGE: libtorch-cxx11-builder
-      DOCKER_IMAGE_TAG_PREFIX: rocm7.1
-      LIBTORCH_CONFIG: release
-      LIBTORCH_VARIANT: shared-with-deps
-    permissions:
-      id-token: write
-      contents: read
-    steps:
-      - name: Setup ROCm
-        uses: ./.github/actions/setup-rocm
-      - uses: actions/download-artifact@v4.1.7
-        name: Download Build Artifacts
-        with:
-          name: libtorch-rocm7_1-shared-with-deps-release
-          path: "${{ runner.temp }}/artifacts/"
-      - name: Checkout PyTorch
-        uses: actions/checkout@v4
-        with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
-          submodules: recursive
-          path: pytorch
-          show-progress: false
-      - name: Clean PyTorch checkout
-        run: |
-          # Remove any artifacts from the previous checkouts
-          git clean -fxd
-        working-directory: pytorch
-      - name: ROCm set GPU_FLAG
-        run: |
-          echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}"
-      - name: configure aws credentials
-        id: aws_creds
-        if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') }}
-        uses: aws-actions/configure-aws-credentials@v4
-        with:
-          role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
-          aws-region: us-east-1
-          role-duration-seconds: 18000
-      - name: Calculate docker image
-        id: calculate-docker-image
-        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
-        with:
-          docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }}
-          docker-image-name: libtorch-cxx11-builder
-          custom-tag-prefix: rocm7.1
-          docker-build-dir: .ci/docker
-          working-directory: pytorch
-      - name: Pull Docker image
-        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
-        with:
-          docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
-      - name: Test Pytorch binary
-        uses: ./pytorch/.github/actions/test-pytorch-binary
-        env:
-          DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
-      - name: Teardown ROCm
-        uses: ./.github/actions/teardown-rocm
-  libtorch-rocm7_1-shared-with-deps-release-upload:  # Uploading
-    if: ${{ github.repository_owner == 'pytorch' }}
-    permissions:
-      id-token: write
-      contents: read
-    needs: libtorch-rocm7_1-shared-with-deps-release-test
-    with:
-      PYTORCH_ROOT: /pytorch
-      PACKAGE_TYPE: libtorch
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm7.1
-      GPU_ARCH_VERSION: "7.1"
-      GPU_ARCH_TYPE: rocm
-      DOCKER_IMAGE: libtorch-cxx11-builder
-      DOCKER_IMAGE_TAG_PREFIX: rocm7.1
-      LIBTORCH_CONFIG: release
-      LIBTORCH_VARIANT: shared-with-deps
-      build_name: libtorch-rocm7_1-shared-with-deps-release
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-    uses: ./.github/workflows/_binary-upload.yml
--- a/.github/workflows/generated-linux-binary-manywheel-nightly.yml
+++ b/.github/workflows/generated-linux-binary-manywheel-nightly.yml
--- a/.github/workflows/inductor-perf-test-nightly-xpu.yml
+++ b/.github/workflows/inductor-perf-test-nightly-xpu.yml
@ -1,148 +0,0 @@
-name: inductor-perf-nightly-xpu
-
-on:
-  push:
-    tags:
-      - ciflow/inductor-perf-test-nightly-xpu/*
-  schedule:
-    - cron: 30 17 * * *
-  workflow_dispatch:
-    inputs:
-      training:
-        description: Run training (on by default)?
-        required: false
-        type: boolean
-        default: true
-      inference:
-        description: Run inference (on by default)?
-        required: false
-        type: boolean
-        default: true
-      default:
-        description: Run inductor_default?
-        required: false
-        type: boolean
-        default: false
-      dynamic:
-        description: Run inductor_dynamic_shapes?
-        required: false
-        type: boolean
-        default: false
-      cppwrapper:
-        description: Run inductor_cpp_wrapper?
-        required: false
-        type: boolean
-        default: false
-      cudagraphs:
-        description: Run inductor_cudagraphs?
-        required: false
-        type: boolean
-        default: false
-      freezing_cudagraphs:
-        description: Run inductor_cudagraphs with freezing for inference?
-        required: false
-        type: boolean
-        default: false
-      aotinductor:
-        description: Run aot_inductor for inference?
-        required: false
-        type: boolean
-        default: false
-      maxautotune:
-        description: Run inductor_max_autotune?
-        required: false
-        type: boolean
-        default: false
-      benchmark_configs:
-        description: The list of configs used the benchmark
-        required: false
-        type: string
-        default: inductor_huggingface_perf,inductor_timm_perf,inductor_torchbench_perf,cachebench
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
-  cancel-in-progress: true
-
-permissions: read-all
-
-jobs:
-  get-label-type:
-    name: get-label-type
-    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
-    if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
-    with:
-      triggering_actor: ${{ github.triggering_actor }}
-      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
-      curr_branch: ${{ github.head_ref || github.ref_name }}
-      curr_ref_type: ${{ github.ref_type }}
-      opt_out_experiments: lf
-
-  xpu-n-py3_10-inductor-benchmark-build:
-    name: xpu-n-py3.10-inductor-benchmark
-    uses: ./.github/workflows/_linux-build.yml
-    needs: get-label-type
-    with:
-      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      build-environment: linux-jammy-xpu-n-py3.10
-      docker-image-name: ci-image:pytorch-linux-jammy-xpu-n-py3-inductor-benchmarks
-      runner: linux.c7i.12xlarge
-      test-matrix: |
-        { include: [
-          { config: "inductor_huggingface_perf_xpu", shard: 1, num_shards: 5, runner: "linux.idc.xpu" },
-          { config: "inductor_huggingface_perf_xpu", shard: 2, num_shards: 5, runner: "linux.idc.xpu" },
-          { config: "inductor_huggingface_perf_xpu", shard: 3, num_shards: 5, runner: "linux.idc.xpu" },
-          { config: "inductor_huggingface_perf_xpu", shard: 4, num_shards: 5, runner: "linux.idc.xpu" },
-          { config: "inductor_huggingface_perf_xpu", shard: 5, num_shards: 5, runner: "linux.idc.xpu" },
-          { config: "inductor_timm_perf_xpu", shard: 1, num_shards: 6, runner: "linux.idc.xpu" },
-          { config: "inductor_timm_perf_xpu", shard: 2, num_shards: 6, runner: "linux.idc.xpu" },
-          { config: "inductor_timm_perf_xpu", shard: 3, num_shards: 6, runner: "linux.idc.xpu" },
-          { config: "inductor_timm_perf_xpu", shard: 4, num_shards: 6, runner: "linux.idc.xpu" },
-          { config: "inductor_timm_perf_xpu", shard: 5, num_shards: 6, runner: "linux.idc.xpu" },
-          { config: "inductor_timm_perf_xpu", shard: 6, num_shards: 6, runner: "linux.idc.xpu" },
-          { config: "inductor_torchbench_perf_xpu", shard: 1, num_shards: 6, runner: "linux.idc.xpu" },
-          { config: "inductor_torchbench_perf_xpu", shard: 2, num_shards: 6, runner: "linux.idc.xpu" },
-          { config: "inductor_torchbench_perf_xpu", shard: 3, num_shards: 6, runner: "linux.idc.xpu" },
-          { config: "inductor_torchbench_perf_xpu", shard: 4, num_shards: 6, runner: "linux.idc.xpu" },
-          { config: "inductor_torchbench_perf_xpu", shard: 5, num_shards: 6, runner: "linux.idc.xpu" },
-          { config: "inductor_torchbench_perf_xpu", shard: 6, num_shards: 6, runner: "linux.idc.xpu" },
-        ]}
-    secrets: inherit
-
-  xpu-n-py3_10-inductor-benchmark-test-nightly:
-    permissions:
-      id-token: write
-      contents: read
-    if: github.event_name != 'workflow_dispatch'
-    name: xpu-n-py3.10-inductor-benchmark
-    uses: ./.github/workflows/_xpu-test.yml
-    needs: xpu-n-py3_10-inductor-benchmark-build
-    with:
-      build-environment: linux-jammy-xpu-n-py3.10
-      dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-false-cppwrapper-true-aotinductor-true-freezing_cudagraphs-false-cudagraphs_low_precision-false
-      docker-image: ${{ needs.xpu-n-py3_10-inductor-benchmark-build.outputs.docker-image }}
-      test-matrix: ${{ needs.xpu-n-py3_10-inductor-benchmark-build.outputs.test-matrix }}
-      timeout-minutes: 720
-      # Disable monitor in perf tests for more investigation
-      disable-monitor: true
-      monitor-log-interval: 10
-      monitor-data-collect-interval: 2
-    secrets: inherit
-
-  xpu-n-py3_10-inductor-benchmark-test:
-    permissions:
-      id-token: write
-      contents: read
-    if: github.event_name == 'workflow_dispatch'
-    name: xpu-n-py3.10-inductor-test
-    uses: ./.github/workflows/_xpu-test.yml
-    needs: xpu-n-py3_10-inductor-benchmark-build
-    with:
-      build-environment: linux-jammy-xpu-n-py3.10
-      dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cudagraphs-${{ inputs.cudagraphs }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}-maxautotune-${{ inputs.maxautotune }}-freezing_cudagraphs-${{ inputs.freezing_cudagraphs }}-cudagraphs_low_precision-${{ inputs.cudagraphs }}
-      docker-image: ${{ needs.xpu-n-py3_10-inductor-benchmark-build.outputs.docker-image }}
-      test-matrix: ${{ needs.xpu-n-py3_10-inductor-benchmark-build.outputs.test-matrix }}
-      timeout-minutes: 720
-      disable-monitor: false
-      monitor-log-interval: 15
-      monitor-data-collect-interval: 4
-    secrets: inherit
--- a/.github/workflows/periodic-rocm-mi200.yml
+++ b/.github/workflows/periodic-rocm-mi200.yml
@ -1,84 +0,0 @@
-name: periodic-rocm-mi200
-
-on:
-  schedule:
-    # We have several schedules so jobs can check github.event.schedule to activate only for a fraction of the runs.
-    # Also run less frequently on weekends.
-    - cron: 45 0,8,16 * * 1-5
-    - cron: 45 4 * * 0,6
-    - cron: 45 4,12,20 * * 1-5
-    - cron: 45 12 * * 0,6
-    - cron: 29 8 * * *  # about 1:29am PDT, for mem leak check and rerun disabled tests
-  push:
-    tags:
-      - ciflow/periodic/*
-      - ciflow/periodic-rocm-mi200/*
-    branches:
-      - release/*
-  workflow_dispatch:
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}-${{ github.event.schedule }}
-  cancel-in-progress: true
-
-permissions:
-  id-token: write
-  contents: read
-
-jobs:
-  llm-td:
-    if: github.repository_owner == 'pytorch'
-    name: before-test
-    uses: ./.github/workflows/llm_td_retrieval.yml
-    permissions:
-      id-token: write
-      contents: read
-
-  target-determination:
-    name: before-test
-    uses: ./.github/workflows/target_determination.yml
-    needs: llm-td
-    permissions:
-      id-token: write
-      contents: read
-
-  get-label-type:
-    name: get-label-type
-    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
-    if: (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch'
-    with:
-      triggering_actor: ${{ github.triggering_actor }}
-      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
-      curr_branch: ${{ github.head_ref || github.ref_name }}
-      curr_ref_type: ${{ github.ref_type }}
-
-  linux-jammy-rocm-py3_10-build:
-    name: linux-jammy-rocm-py3.10
-    uses: ./.github/workflows/_linux-build.yml
-    needs: get-label-type
-    with:
-      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      build-environment: linux-jammy-rocm-py3.10
-      docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
-      test-matrix: |
-        { include: [
-          { config: "distributed", shard: 1, num_shards: 3, runner: "linux.rocm.gpu.4", owners: ["module:rocm", "oncall:distributed"] },
-          { config: "distributed", shard: 2, num_shards: 3, runner: "linux.rocm.gpu.4", owners: ["module:rocm", "oncall:distributed"] },
-          { config: "distributed", shard: 3, num_shards: 3, runner: "linux.rocm.gpu.4", owners: ["module:rocm", "oncall:distributed"] },
-        ]}
-    secrets: inherit
-
-  linux-jammy-rocm-py3_10-test:
-    permissions:
-      id-token: write
-      contents: read
-    name: linux-jammy-rocm-py3.10
-    uses: ./.github/workflows/_rocm-test.yml
-    needs:
-      - linux-jammy-rocm-py3_10-build
-      - target-determination
-    with:
-      build-environment: linux-jammy-rocm-py3.10
-      docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }}
-    secrets: inherit
--- a/.github/workflows/periodic.yml
+++ b/.github/workflows/periodic.yml
@ -204,6 +204,37 @@ jobs:
      test-matrix: ${{ needs.linux-jammy-cuda13_0-py3_10-gcc11-build.outputs.test-matrix }}
    secrets: inherit

+  linux-jammy-rocm-py3_10-build:
+    name: linux-jammy-rocm-py3.10
+    uses: ./.github/workflows/_linux-build.yml
+    needs: get-label-type
+    with:
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      build-environment: linux-jammy-rocm-py3.10
+      docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
+      test-matrix: |
+        { include: [
+          { config: "distributed", shard: 1, num_shards: 3, runner: "linux.rocm.gpu.mi250.4", owners: ["module:rocm", "oncall:distributed"] },
+          { config: "distributed", shard: 2, num_shards: 3, runner: "linux.rocm.gpu.mi250.4", owners: ["module:rocm", "oncall:distributed"] },
+          { config: "distributed", shard: 3, num_shards: 3, runner: "linux.rocm.gpu.mi250.4", owners: ["module:rocm", "oncall:distributed"] },
+        ]}
+    secrets: inherit
+
+  linux-jammy-rocm-py3_10-test:
+    permissions:
+      id-token: write
+      contents: read
+    name: linux-jammy-rocm-py3.10
+    uses: ./.github/workflows/_rocm-test.yml
+    needs:
+      - linux-jammy-rocm-py3_10-build
+      - target-determination
+    with:
+      build-environment: linux-jammy-rocm-py3.10
+      docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }}
+    secrets: inherit
+
  linux-jammy-cuda12_8-py3-gcc11-slow-gradcheck-build:
    name: linux-jammy-cuda12.8-py3-gcc11-slow-gradcheck
    uses: ./.github/workflows/_linux-build.yml
--- a/.github/workflows/upload-test-stats.yml
+++ b/.github/workflows/upload-test-stats.yml
@ -6,7 +6,6 @@ on:
      - pull
      - trunk
      - periodic
-      - periodic-rocm-mi200
      - periodic-rocm-mi300
      - inductor
      - unstable
--- a/.github/workflows/xpu.yml
+++ b/.github/workflows/xpu.yml
@ -59,18 +59,14 @@ jobs:
      runner: linux.c7i.12xlarge
      test-matrix: |
        { include: [
-          { config: "default", shard: 1, num_shards: 12, runner: "linux.idc.xpu" },
-          { config: "default", shard: 2, num_shards: 12, runner: "linux.idc.xpu" },
-          { config: "default", shard: 3, num_shards: 12, runner: "linux.idc.xpu" },
-          { config: "default", shard: 4, num_shards: 12, runner: "linux.idc.xpu" },
-          { config: "default", shard: 5, num_shards: 12, runner: "linux.idc.xpu" },
-          { config: "default", shard: 6, num_shards: 12, runner: "linux.idc.xpu" },
-          { config: "default", shard: 7, num_shards: 12, runner: "linux.idc.xpu" },
-          { config: "default", shard: 8, num_shards: 12, runner: "linux.idc.xpu" },
-          { config: "default", shard: 9, num_shards: 12, runner: "linux.idc.xpu" },
-          { config: "default", shard: 10, num_shards: 12, runner: "linux.idc.xpu" },
-          { config: "default", shard: 11, num_shards: 12, runner: "linux.idc.xpu" },
-          { config: "default", shard: 12, num_shards: 12, runner: "linux.idc.xpu" },
+          { config: "default", shard: 1, num_shards: 8, runner: "linux.idc.xpu" },
+          { config: "default", shard: 2, num_shards: 8, runner: "linux.idc.xpu" },
+          { config: "default", shard: 3, num_shards: 8, runner: "linux.idc.xpu" },
+          { config: "default", shard: 4, num_shards: 8, runner: "linux.idc.xpu" },
+          { config: "default", shard: 5, num_shards: 8, runner: "linux.idc.xpu" },
+          { config: "default", shard: 6, num_shards: 8, runner: "linux.idc.xpu" },
+          { config: "default", shard: 7, num_shards: 8, runner: "linux.idc.xpu" },
+          { config: "default", shard: 8, num_shards: 8, runner: "linux.idc.xpu" },
        ]}
    secrets: inherit

--- a/.gitignore
+++ b/.gitignore
@ -143,7 +143,6 @@ scripts/release_notes/*.json
 sccache-stats*.json
 lint.json
 merge_record.json
-.github/scripts/nightly_source_matrix.json

 # These files get copied over on invoking setup.py
 torchgen/packaged/*
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -374,7 +374,7 @@ cmake_dependent_option(
  "Build the lazy Torchscript backend, not compatible with mobile builds" ON
  "NOT INTERN_BUILD_MOBILE" OFF)
 cmake_dependent_option(BUILD_FUNCTORCH "Build Functorch" ON "BUILD_PYTHON" OFF)
-cmake_dependent_option(BUILD_BUNDLE_PTXAS "Bundle PTX into torch/bin folder"
+cmake_dependent_option(BUILD_BUNDLE_PTXAS "Bundle PTX into torch/bin folder"
                       OFF "USE_CUDA" OFF)
 cmake_dependent_option(USE_KLEIDIAI "Use KleidiAI for the ARM CPU & AARCH64 architecture." ON
                        "CPU_AARCH64" OFF)
--- a/README.md
+++ b/README.md
@ -1,4 +1,4 @@
-![PyTorch Logo](https://github.com/pytorch/pytorch/raw/main/docs/source/_static/img/pytorch-logo-dark.png)
+![PyTorch Logo](https://github.com/pytorch/pytorch/raw/9708fcf92db88b80b9010c68662d634434da3106/docs/source/_static/img/pytorch-logo-dark.png)

 --------------------------------------------------------------------------------

@ -72,7 +72,7 @@ Elaborating Further:

 If you use NumPy, then you have used Tensors (a.k.a. ndarray).

-![Tensor illustration](https://github.com/pytorch/pytorch/raw/main/docs/source/_static/img/tensor_illustration.png)
+![Tensor illustration](https://github.com/pytorch/pytorch/raw/9708fcf92db88b80b9010c68662d634434da3106/docs/source/_static/img/tensor_illustration.png)

 PyTorch provides Tensors that can live either on the CPU or the GPU and accelerates the
 computation by a huge amount.
@ -99,7 +99,7 @@ from several research papers on this topic, as well as current and past work suc
 While this technique is not unique to PyTorch, it's one of the fastest implementations of it to date.
 You get the best of speed and flexibility for your crazy research.

-![Dynamic graph](https://github.com/pytorch/pytorch/raw/main/docs/source/_static/img/dynamic_graph.gif)
+![Dynamic graph](https://github.com/pytorch/pytorch/raw/9708fcf92db88b80b9010c68662d634434da3106/docs/source/_static/img/dynamic_graph.gif)

 ### Python First

--- a/aten/src/ATen/CMakeLists.txt
+++ b/aten/src/ATen/CMakeLists.txt
@ -260,7 +260,7 @@ IF(USE_FBGEMM_GENAI)
  if(USE_CUDA)
    # To avoid increasing the build time/binary size unnecessarily, use an allow-list of kernels to build.
    # If you want to integrate a kernel from FBGEMM into torch, you have to add it here.
-    set(FBGEMM_CUTLASS_KERNELS_REGEX ".*(mx8mx8bf16_grouped|f4f4bf16_grouped|f4f4bf16).*")
+    set(FBGEMM_CUTLASS_KERNELS_REGEX ".*mx8mx8bf16_grouped.*")
    file(GLOB_RECURSE fbgemm_genai_native_cuda_cu
      "${FBGEMM_GENAI_SRCS}/cutlass_extensions/*.cu"
      "${FBGEMM_GENAI_SRCS}/cutlass_extensions/**/*.cu")
@ -291,7 +291,6 @@ IF(USE_FBGEMM_GENAI)

    set(fbgemm_genai_cuh
      "${FBGEMM_GENAI_SRCS}/cutlass_extensions/mx8mx8bf16_grouped/"
-      "${FBGEMM_GENAI_SRCS}/cutlass_extensions/f4f4bf16_grouped/"
      "${FBGEMM_GENAI_SRCS}/"
    )

--- a/aten/src/ATen/core/CachingHostAllocator.h
+++ b/aten/src/ATen/core/CachingHostAllocator.h
@ -677,8 +677,8 @@ struct CachingHostAllocatorImpl {
  // size. This allows us to quickly find a free block of the right size.
  // We use deque to store per size free list and guard the list with its own
  // mutex.
-  alignas(hardware_destructive_interference_size) std::vector<FreeBlockList<B>>
-      free_list_{MAX_SIZE_INDEX};
+  alignas(hardware_destructive_interference_size) std::vector<FreeBlockList<B>> free_list_ =
+      std::vector<FreeBlockList<B>>(MAX_SIZE_INDEX);

  alignas(hardware_destructive_interference_size) std::mutex events_mutex_;
  std::deque<std::pair<E, B*>> events_; // event queue paired with block
--- a/aten/src/ATen/cpu/vec/vec128/vec128_bfloat16_neon.h
+++ b/aten/src/ATen/cpu/vec/vec128/vec128_bfloat16_neon.h
@ -19,13 +19,6 @@ inline namespace CPU_CAPABILITY {
 #error "Big endian is not supported."
 #endif

-// GCC does not properly optimize bf16 operators
-#if defined(__ARM_FEATURE_BF16) && (__clang_major__ >= 19)
-#define BF16_ARITHMETIC_SUPPORTED() 1
-#else
-#define BF16_ARITHMETIC_SUPPORTED() 0
-#endif
-
 // Unlike the float16_t family of types, bfloat16_t is not available
 // when we're not targeting bfloat16 hardware support on some
 // platforms (but not Mac, so we have to be careful not to shadow the
@ -359,35 +352,18 @@ class Vectorized<c10::BFloat16> : public Vectorized16<
        other, &Vectorized<float>::name);                        \
  }

+  DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD(abs)
  Vectorized frac() const;
  DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD(trunc)
  DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD(sqrt)

 #ifdef __ARM_FEATURE_BF16
-  // Flip sign bit
  Vectorized<c10::BFloat16> neg() const {
-    return vreinterpretq_bf16_s16(vreinterpretq_s16_bf16(values) ^ (-32768));
+    return -values;
  }
-  // Fast reciprocal is fine because we are truncating results
  Vectorized<c10::BFloat16> reciprocal() const {
-    auto x = vcvtq_low_f32_bf16(values);
-    auto y = vcvtq_high_f32_bf16(values);
-    x = vrecpeq_f32(x);
-    y = vrecpeq_f32(y);
-    return vcvtq_high_bf16_f32(vcvtq_low_bf16_f32(x), y);
+    return 1.0f / values;
  }
-  // Clearing the sign bit
-  Vectorized<c10::BFloat16> abs() const {
-    return vreinterpretq_bf16_u16(vreinterpretq_u16_bf16(values) & 0x7FFF);
-  }
-#else
-  DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD(abs)
-  DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD(neg)
-  DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD(reciprocal)
-#endif
-
-// These functions are optimized on clang-21+
-#if BF16_ARITHMETIC_SUPPORTED() && (__clang_major__ >= 21)
  Vectorized<c10::BFloat16> operator==(
      const Vectorized<c10::BFloat16>& other) const {
    return values == other.values;
@ -418,6 +394,8 @@ class Vectorized<c10::BFloat16> : public Vectorized16<
    return values >= other.values;
  }
 #else
+  DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD(neg)
+  DEFINE_UNARY_ELEMENTWISE_FUNC_VIA_FLOAT_METHOD(reciprocal)
  DEFINE_BINARY_COMPARISON_OPERATOR_VIA_FLOAT_METHOD(operator==)
  DEFINE_BINARY_COMPARISON_OPERATOR_VIA_FLOAT_METHOD(operator!=)
  DEFINE_BINARY_COMPARISON_OPERATOR_VIA_FLOAT_METHOD(operator<)
@ -473,7 +451,7 @@ template <>
 Vectorized<c10::BFloat16> inline operator+(
    const Vectorized<c10::BFloat16>& a,
    const Vectorized<c10::BFloat16>& b) {
-#if BF16_ARITHMETIC_SUPPORTED()
+#ifdef __ARM_FEATURE_BF16
  bfloat16x8_t x = a;
  bfloat16x8_t y = b;
  return x + y;
@ -486,7 +464,7 @@ template <>
 Vectorized<c10::BFloat16> inline operator-(
    const Vectorized<c10::BFloat16>& a,
    const Vectorized<c10::BFloat16>& b) {
-#if BF16_ARITHMETIC_SUPPORTED()
+#ifdef __ARM_FEATURE_BF16
  bfloat16x8_t x = a;
  bfloat16x8_t y = b;
  return x - y;
@ -499,7 +477,7 @@ template <>
 Vectorized<c10::BFloat16> inline operator*(
    const Vectorized<c10::BFloat16>& a,
    const Vectorized<c10::BFloat16>& b) {
-#if BF16_ARITHMETIC_SUPPORTED()
+#ifdef __ARM_FEATURE_BF16
  bfloat16x8_t x = a;
  bfloat16x8_t y = b;
  return x * y;
@ -512,7 +490,7 @@ template <>
 Vectorized<c10::BFloat16> inline operator/(
    const Vectorized<c10::BFloat16>& a,
    const Vectorized<c10::BFloat16>& b) {
-#if BF16_ARITHMETIC_SUPPORTED()
+#ifdef __ARM_FEATURE_BF16
  bfloat16x8_t x = a;
  bfloat16x8_t y = b;
  return x / y;
@ -629,7 +607,7 @@ Vectorized<c10::BFloat16> inline fmadd(
    const Vectorized<c10::BFloat16>& a,
    const Vectorized<c10::BFloat16>& b,
    const Vectorized<c10::BFloat16>& c) {
-#if BF16_ARITHMETIC_SUPPORTED()
+#ifdef __ARM_FEATURE_BF16
  bfloat16x8_t x = a;
  bfloat16x8_t y = b;
  bfloat16x8_t z = c;
@ -649,7 +627,7 @@ Vectorized<c10::BFloat16> inline fnmadd(
    const Vectorized<c10::BFloat16>& a,
    const Vectorized<c10::BFloat16>& b,
    const Vectorized<c10::BFloat16>& c) {
-#if BF16_ARITHMETIC_SUPPORTED()
+#ifdef __ARM_FEATURE_BF16
  bfloat16x8_t x = a;
  bfloat16x8_t y = b;
  bfloat16x8_t z = c;
@ -665,7 +643,7 @@ Vectorized<c10::BFloat16> inline fmsub(
    const Vectorized<c10::BFloat16>& a,
    const Vectorized<c10::BFloat16>& b,
    const Vectorized<c10::BFloat16>& c) {
-#if BF16_ARITHMETIC_SUPPORTED()
+#ifdef __ARM_FEATURE_BF16
  bfloat16x8_t x = a;
  bfloat16x8_t y = b;
  bfloat16x8_t z = c;
@ -681,7 +659,7 @@ Vectorized<c10::BFloat16> inline fnmsub(
    const Vectorized<c10::BFloat16>& a,
    const Vectorized<c10::BFloat16>& b,
    const Vectorized<c10::BFloat16>& c) {
-#if BF16_ARITHMETIC_SUPPORTED()
+#ifdef __ARM_FEATURE_BF16
  bfloat16x8_t x = a;
  bfloat16x8_t y = b;
  bfloat16x8_t z = c;
--- a/aten/src/ATen/cpu/vec/vec128/vec128_convert.h
+++ b/aten/src/ATen/cpu/vec/vec128/vec128_convert.h
@ -6,9 +6,9 @@ namespace at::vec {
 inline namespace CPU_CAPABILITY {
 #if (defined(__aarch64__) && !defined(CPU_CAPABILITY_SVE256))

-// Enable auto-vectorization for clang-17+
+// Enable auto-vectorization for GCC-13+ and clang-17+
 // GCC-12 has a bug: gcc.gnu.org/bugzilla/show_bug.cgi?id=117001
-#if defined(__clang__) && (__clang_major__ >= 17)
+#if __GNUC__ > 12 || (defined(__clang__) && (__clang_major__ >= 17))

 template <typename from_type, typename to_type>
 inline void convertImpl(
@ -21,46 +21,12 @@ inline void convertImpl(
  }
 }

-template <typename to_type>
-inline void convertFromBool(
-    const bool* __restrict src,
-    to_type* __restrict dst,
-    int64_t n) {
-  const uint8_t* srcPtr = reinterpret_cast<const uint8_t*>(src);
-  uint64_t len = static_cast<uint64_t>(n);
-  for (uint64_t i = 0; i < len; i++) {
-    dst[i] = srcPtr[i] != 0 ? static_cast<to_type>(1) : static_cast<to_type>(0);
-  }
-}
-
-template <typename from_type>
-inline void convertToBool(
-    const from_type* __restrict src,
-    bool* __restrict dst,
-    int64_t n) {
-  uint8_t* dstPtr = reinterpret_cast<uint8_t*>(dst);
-  uint64_t len = static_cast<uint64_t>(n);
-  for (uint64_t i = 0; i < len; i++) {
-    dstPtr[i] = src[i] != static_cast<from_type>(0) ? 1 : 0;
-  }
-}
-
 #define CONVERT_TEMPLATE(from_type, to_type)                           \
  template <>                                                          \
  inline void convert(const from_type* src, to_type* dst, int64_t n) { \
    return convertImpl<from_type, to_type>(src, dst, n);               \
  }

-#define CONVERT_FROM_BOOL_TEMPLATE(to_type)                       \
-  inline void convert(const bool* src, to_type* dst, int64_t n) { \
-    return convertFromBool<to_type>(src, dst, n);                 \
-  }
-
-#define CONVERT_TO_BOOL_TEMPLATE(from_type)                         \
-  inline void convert(const from_type* src, bool* dst, int64_t n) { \
-    return convertToBool<from_type>(src, dst, n);                   \
-  }
-
 CONVERT_TEMPLATE(uint8_t, uint8_t)
 CONVERT_TEMPLATE(uint8_t, int8_t)
 CONVERT_TEMPLATE(uint8_t, int16_t)
@ -68,7 +34,6 @@ CONVERT_TEMPLATE(uint8_t, int32_t)
 CONVERT_TEMPLATE(uint8_t, int64_t)
 CONVERT_TEMPLATE(uint8_t, float)
 CONVERT_TEMPLATE(uint8_t, double)
-CONVERT_TO_BOOL_TEMPLATE(uint8_t)
 CONVERT_TEMPLATE(int8_t, uint8_t)
 CONVERT_TEMPLATE(int8_t, int8_t)
 CONVERT_TEMPLATE(int8_t, int16_t)
@ -76,7 +41,6 @@ CONVERT_TEMPLATE(int8_t, int32_t)
 CONVERT_TEMPLATE(int8_t, int64_t)
 CONVERT_TEMPLATE(int8_t, float)
 CONVERT_TEMPLATE(int8_t, double)
-CONVERT_TO_BOOL_TEMPLATE(int8_t)
 CONVERT_TEMPLATE(int16_t, uint8_t)
 CONVERT_TEMPLATE(int16_t, int8_t)
 CONVERT_TEMPLATE(int16_t, int16_t)
@ -84,7 +48,6 @@ CONVERT_TEMPLATE(int16_t, int32_t)
 CONVERT_TEMPLATE(int16_t, int64_t)
 CONVERT_TEMPLATE(int16_t, float)
 CONVERT_TEMPLATE(int16_t, double)
-CONVERT_TO_BOOL_TEMPLATE(int16_t)
 CONVERT_TEMPLATE(int32_t, uint8_t)
 CONVERT_TEMPLATE(int32_t, int8_t)
 CONVERT_TEMPLATE(int32_t, int16_t)
@ -92,7 +55,6 @@ CONVERT_TEMPLATE(int32_t, int32_t)
 CONVERT_TEMPLATE(int32_t, int64_t)
 CONVERT_TEMPLATE(int32_t, float)
 CONVERT_TEMPLATE(int32_t, double)
-CONVERT_TO_BOOL_TEMPLATE(int32_t)
 CONVERT_TEMPLATE(int64_t, uint8_t)
 CONVERT_TEMPLATE(int64_t, int8_t)
 CONVERT_TEMPLATE(int64_t, int16_t)
@ -100,7 +62,6 @@ CONVERT_TEMPLATE(int64_t, int32_t)
 CONVERT_TEMPLATE(int64_t, int64_t)
 CONVERT_TEMPLATE(int64_t, float)
 CONVERT_TEMPLATE(int64_t, double)
-CONVERT_TO_BOOL_TEMPLATE(int64_t)
 CONVERT_TEMPLATE(float, uint8_t)
 CONVERT_TEMPLATE(float, int8_t)
 CONVERT_TEMPLATE(float, int16_t)
@ -108,7 +69,6 @@ CONVERT_TEMPLATE(float, int32_t)
 CONVERT_TEMPLATE(float, int64_t)
 CONVERT_TEMPLATE(float, float)
 CONVERT_TEMPLATE(float, double)
-CONVERT_TO_BOOL_TEMPLATE(float)
 CONVERT_TEMPLATE(double, uint8_t)
 CONVERT_TEMPLATE(double, int8_t)
 CONVERT_TEMPLATE(double, int16_t)
@ -116,14 +76,6 @@ CONVERT_TEMPLATE(double, int32_t)
 CONVERT_TEMPLATE(double, int64_t)
 CONVERT_TEMPLATE(double, float)
 CONVERT_TEMPLATE(double, double)
-CONVERT_TO_BOOL_TEMPLATE(double)
-CONVERT_FROM_BOOL_TEMPLATE(uint8_t)
-CONVERT_FROM_BOOL_TEMPLATE(int8_t)
-CONVERT_FROM_BOOL_TEMPLATE(int16_t)
-CONVERT_FROM_BOOL_TEMPLATE(int32_t)
-CONVERT_FROM_BOOL_TEMPLATE(int64_t)
-CONVERT_FROM_BOOL_TEMPLATE(float)
-CONVERT_FROM_BOOL_TEMPLATE(double)
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

 #define CONVERT_FROM_FP16_TEMPLATE(to_type)                            \
@ -155,41 +107,6 @@ CONVERT_TO_FP16_TEMPLATE(int32_t)
 CONVERT_TO_FP16_TEMPLATE(int64_t)
 CONVERT_TO_FP16_TEMPLATE(float)
 CONVERT_TO_FP16_TEMPLATE(double)
-
-inline void convertBoolToFp16Impl(
-    const bool* __restrict src,
-    at::Half* __restrict dst,
-    int64_t n) {
-  const uint8_t* srcPtr = reinterpret_cast<const uint8_t*>(src);
-  float16_t* dstPtr = reinterpret_cast<float16_t*>(dst);
-  uint64_t len = static_cast<uint64_t>(n);
-  for (uint64_t i = 0; i < len; i++) {
-    dstPtr[i] = srcPtr[i] != 0 ? 1.0 : 0;
-  }
-}
-
-template <>
-inline void convert(const bool* src, at::Half* dst, int64_t n) {
-  return convertBoolToFp16Impl(src, dst, n);
-}
-
-inline void convertFp16ToBoolImpl(
-    const at::Half* __restrict src,
-    bool* __restrict dst,
-    int64_t n) {
-  const float16_t* srcPtr = reinterpret_cast<const float16_t*>(src);
-  uint8_t* dstPtr = reinterpret_cast<uint8_t*>(dst);
-  uint64_t len = static_cast<uint64_t>(n);
-  for (uint64_t i = 0; i < len; i++) {
-    dstPtr[i] = srcPtr[i] != 0.0 ? 1 : 0;
-  }
-}
-
-template <>
-inline void convert(const at::Half* src, bool* dst, int64_t n) {
-  return convertFp16ToBoolImpl(src, dst, n);
-}
-
 #endif
 #ifdef __ARM_FEATURE_BF16
 CONVERT_TEMPLATE(bfloat16_t, uint8_t)
@ -207,44 +124,6 @@ CONVERT_TEMPLATE(int32_t, bfloat16_t)
 CONVERT_TEMPLATE(int64_t, bfloat16_t)
 CONVERT_TEMPLATE(float, bfloat16_t)
 CONVERT_TEMPLATE(double, bfloat16_t)
-
-inline void convertBoolToBfloat16Impl(
-    const bool* __restrict src,
-    c10::BFloat16* __restrict dst,
-    int64_t n) {
-  const uint8_t* srcPtr = reinterpret_cast<const uint8_t*>(src);
-  uint16_t* dstPtr = reinterpret_cast<uint16_t*>(dst);
-  uint64_t len = static_cast<uint64_t>(n);
-  constexpr uint16_t kBf16One = 0x3f80; // 1.0 in bfloat16
-  for (uint64_t i = 0; i < len; i++) {
-    dstPtr[i] = srcPtr[i] != 0 ? kBf16One : 0;
-  }
-}
-
-template <>
-inline void convert(const bool* src, c10::BFloat16* dst, int64_t n) {
-  return convertBoolToBfloat16Impl(src, dst, n);
-}
-
-inline void convertBfloat16ToBoolImpl(
-    const c10::BFloat16* __restrict src,
-    bool* __restrict dst,
-    int64_t n) {
-  uint8_t* dstPtr = reinterpret_cast<uint8_t*>(dst);
-  const uint16_t* srcPtr = reinterpret_cast<const uint16_t*>(src);
-  uint64_t len = static_cast<uint64_t>(n);
-  for (uint64_t i = 0; i < len; i++) {
-    // Check if all non-sign bits are 0
-    bool isBf16Zero = (srcPtr[i] & 0x7fff) == 0;
-    dstPtr[i] = isBf16Zero ? 0 : 1;
-  }
-}
-
-template <>
-inline void convert(const c10::BFloat16* src, bool* dst, int64_t n) {
-  return convertBfloat16ToBoolImpl(src, dst, n);
-}
-
 #endif

 #endif
--- a/aten/src/ATen/cpu/vec/vec128/vec128_float_neon.h
+++ b/aten/src/ATen/cpu/vec/vec128/vec128_float_neon.h
@ -309,7 +309,7 @@ class Vectorized<float> {
  DEFINE_SLEEF_COMPATIBLE_UNARY_ELEMENTWISE_FUNC(expm1)
  // Implementation copied from Arm Optimized Routine
  // https://github.com/ARM-software/optimized-routines/blob/master/math/aarch64/advsimd/expf.c
-  inline Vectorized<float> vexpq_f32_u20() const {
+  Vectorized<float> exp_u20() const {
    // bail out to sleef if it's a special case:
    // i.e. there's an input s.t. |input| > 87.3....
    const float32x4_t special_bound = vdupq_n_f32(0x1.5d5e2ap+6f);
@ -348,9 +348,6 @@ class Vectorized<float> {

    return vfmaq_f32(scale, poly, scale);
  }
-  Vectorized<float> exp_u20() const {
-    return vexpq_f32_u20();
-  }
  Vectorized<float> fexp_u20() const {
    return exp_u20();
  }
@ -637,7 +634,7 @@ inline Vectorized<float> Vectorized<float>::erf() const {
  // - exp(- x * x)
  auto pow_2 = (*this) * (*this);
  auto neg_pow_2 = pow_2 ^ neg_zero_vec;
-  auto tmp4 = neg_pow_2.vexpq_f32_u20();
+  auto tmp4 = neg_pow_2.exp();
  auto tmp5 = tmp4 ^ neg_zero_vec;
  // erf(x) = sign(x) * (1 - r * t * exp(- x * x))
  auto tmp6 = t * tmp5;
--- a/aten/src/ATen/cuda/CUDAGreenContext.cpp
+++ b/aten/src/ATen/cuda/CUDAGreenContext.cpp
@ -1,90 +1,78 @@
 #include <ATen/cuda/CUDAGreenContext.h>

-#if defined(CUDA_VERSION) && !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED)
-#include <c10/cuda/driver_api.h>
-#include <stdexcept>
-#include <vector>
-#define HAS_CUDA_GREEN_CONTEXT() 1
-#else
-#define HAS_CUDA_GREEN_CONTEXT() 0
-// Suppress unsued private field warnings as this class is not supposed to be called
-C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wunused-private-field")
-#endif
-
 namespace at::cuda {
+  GreenContext::GreenContext(uint32_t device_id, uint32_t num_sms) {
+#if CUDA_HAS_GREEN_CONTEXT
+    int driver_version;
+    C10_CUDA_CHECK(cudaDriverGetVersion(&driver_version));
+    TORCH_CHECK(
+        driver_version >= 12080, "cuda driver too old to use green context!");
+    CUcontext pctx = nullptr;
+    C10_CUDA_DRIVER_CHECK(c10::cuda::DriverAPI::get()->cuCtxGetCurrent_(&pctx));
+    if (C10_UNLIKELY(!pctx)) {
+      TORCH_WARN(
+          "Attempted to create a green context but"
+          " there was no primary context! Creating a primary context...");

-GreenContext::GreenContext(uint32_t device_id, uint32_t num_sms) {
-#if HAS_CUDA_GREEN_CONTEXT()
-  int driver_version;
-  C10_CUDA_CHECK(cudaDriverGetVersion(&driver_version));
-  TORCH_CHECK(
-      driver_version >= 12080, "cuda driver too old to use green context!");
-  CUcontext pctx = nullptr;
-  C10_CUDA_DRIVER_CHECK(c10::cuda::DriverAPI::get()->cuCtxGetCurrent_(&pctx));
-  if (C10_UNLIKELY(!pctx)) {
-    TORCH_WARN(
-        "Attempted to create a green context but"
-        " there was no primary context! Creating a primary context...");
+      cudaFree(0);
+    }

-    cudaFree(0);
-  }
+    CUdevice device;
+    device_id_ = device_id;
+    C10_CUDA_DRIVER_CHECK(
+        c10::cuda::DriverAPI::get()->cuDeviceGet_(&device, device_id));

-   CUdevice device;
-  device_id_ = device_id;
-  C10_CUDA_DRIVER_CHECK(
-      c10::cuda::DriverAPI::get()->cuDeviceGet_(&device, device_id));
+    // Get device resources
+    CUdevResource device_resource;
+    C10_CUDA_DRIVER_CHECK(c10::cuda::DriverAPI::get()->cuDeviceGetDevResource_(
+        device, &device_resource, CU_DEV_RESOURCE_TYPE_SM));

-  // Get device resources
-  CUdevResource device_resource;
-  C10_CUDA_DRIVER_CHECK(c10::cuda::DriverAPI::get()->cuDeviceGetDevResource_(
-      device, &device_resource, CU_DEV_RESOURCE_TYPE_SM));
+    // Split resources
+    std::vector<CUdevResource> result(1);
+    auto result_data = result.data();
+    unsigned int nb_groups = 1;
+    CUdevResource remaining;

-  // Split resources
-  std::vector<CUdevResource> result(1);
-  auto result_data = result.data();
-  unsigned int nb_groups = 1;
-  CUdevResource remaining;
+    C10_CUDA_DRIVER_CHECK(
+        c10::cuda::DriverAPI::get()->cuDevSmResourceSplitByCount_(
+            result_data,
+            &nb_groups,
+            &device_resource,
+            &remaining,
+            0, // default flags
+            num_sms));

-  C10_CUDA_DRIVER_CHECK(
-      c10::cuda::DriverAPI::get()->cuDevSmResourceSplitByCount_(
-          result_data,
-          &nb_groups,
-          &device_resource,
-          &remaining,
-          0, // default flags
-          num_sms));
+    TORCH_CHECK(nb_groups == 1, "Failed to create single resource group");

-  TORCH_CHECK(nb_groups == 1, "Failed to create single resource group");
+    // Generate resource descriptor
+    CUdevResourceDesc desc;
+    C10_CUDA_DRIVER_CHECK(
+        c10::cuda::DriverAPI::get()->cuDevResourceGenerateDesc_(
+            &desc, result_data, 1));

-  // Generate resource descriptor
-  CUdevResourceDesc desc;
-  C10_CUDA_DRIVER_CHECK(
-      c10::cuda::DriverAPI::get()->cuDevResourceGenerateDesc_(
-          &desc, result_data, 1));
+    // Create green context
+    // CU_GREEN_CTX_DEFAULT_STREAM is required per docs:
+    // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__GREEN__CONTEXTS.html
+    C10_CUDA_DRIVER_CHECK(c10::cuda::DriverAPI::get()->cuGreenCtxCreate_(
+        &green_ctx_, desc, device, CU_GREEN_CTX_DEFAULT_STREAM));

-  // Create green context
-  // CU_GREEN_CTX_DEFAULT_STREAM is required per docs:
-  // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__GREEN__CONTEXTS.html
-  C10_CUDA_DRIVER_CHECK(c10::cuda::DriverAPI::get()->cuGreenCtxCreate_(
-      &green_ctx_, desc, device, CU_GREEN_CTX_DEFAULT_STREAM));
-
-  // Convert to regular context
-  C10_CUDA_DRIVER_CHECK(
-      c10::cuda::DriverAPI::get()->cuCtxFromGreenCtx_(&context_, green_ctx_));
-  TORCH_CHECK(context_, "Green ctx conversion to regular ctx failed!");
+    // Convert to regular context
+    C10_CUDA_DRIVER_CHECK(
+        c10::cuda::DriverAPI::get()->cuCtxFromGreenCtx_(&context_, green_ctx_));
+    TORCH_CHECK(context_, "Green ctx conversion to regular ctx failed!");
 #else
-  TORCH_CHECK(false, "Green Context is only supported on CUDA 12.8+!");
+    TORCH_CHECK(false, "Green Context is only supported on CUDA 12.8+!");
 #endif
  }

  std::unique_ptr<GreenContext> GreenContext::create(
      uint32_t num_sms,
      std::optional<uint32_t> device_id) {
-#if HAS_CUDA_GREEN_CONTEXT()
+#if CUDA_HAS_GREEN_CONTEXT
    if (!device_id.has_value()) {
      device_id = at::cuda::current_device();
    }
-    return std::unique_ptr<GreenContext>(new GreenContext(device_id.value(), num_sms));
+    return std::make_unique<GreenContext>(device_id.value(), num_sms);
 #else
    TORCH_CHECK(false, "Green Context is only supported on CUDA 12.8+!");
 #endif
@ -92,7 +80,7 @@ GreenContext::GreenContext(uint32_t device_id, uint32_t num_sms) {

  // Implement move operations
  GreenContext::GreenContext(GreenContext&& other) noexcept{
-#if HAS_CUDA_GREEN_CONTEXT()
+#if CUDA_HAS_GREEN_CONTEXT
    device_id_ = std::exchange(other.device_id_, -1);
    green_ctx_ = std::exchange(other.green_ctx_, nullptr);
    context_ = std::exchange(other.context_, nullptr);
@ -103,7 +91,7 @@ GreenContext::GreenContext(uint32_t device_id, uint32_t num_sms) {
  }

  GreenContext& GreenContext::operator=(GreenContext&& other) noexcept{
-#if HAS_CUDA_GREEN_CONTEXT()
+#if CUDA_HAS_GREEN_CONTEXT
    if (this != &other) {
      // Clean up current resources
      if (green_ctx_) {
@ -132,7 +120,7 @@ GreenContext::GreenContext(uint32_t device_id, uint32_t num_sms) {
  }

  GreenContext::~GreenContext() noexcept{
-#if HAS_CUDA_GREEN_CONTEXT()
+#if CUDA_HAS_GREEN_CONTEXT
    C10_CUDA_DRIVER_CHECK(
        c10::cuda::DriverAPI::get()->cuGreenCtxDestroy_(green_ctx_));
 #else
@ -140,9 +128,25 @@ GreenContext::GreenContext(uint32_t device_id, uint32_t num_sms) {
 #endif
  }

+  // Get the underlying CUDA context (TORCH_CHECK fails in builds without green context support)
+  CUcontext GreenContext::getContext() const {
+#if CUDA_HAS_GREEN_CONTEXT
+    return context_;
+#else
+    TORCH_CHECK(false, "Green Context is only supported on CUDA 12.8+!");
+#endif
+  }
+
+  // Get the underlying green context
+#if CUDA_HAS_GREEN_CONTEXT
+  CUgreenCtx GreenContext::getGreenContext() const {
+    return green_ctx_;
+  }
+#endif
+
  // Make this context current
  void GreenContext::setContext() {
-#if HAS_CUDA_GREEN_CONTEXT()
+#if CUDA_HAS_GREEN_CONTEXT
    auto current_stream = c10::cuda::getCurrentCUDAStream();
    parent_stream_ = current_stream.stream();

@ -171,7 +175,7 @@ GreenContext::GreenContext(uint32_t device_id, uint32_t num_sms) {
  }

  void GreenContext::popContext() {
-#if HAS_CUDA_GREEN_CONTEXT()
+#if CUDA_HAS_GREEN_CONTEXT
    // see above note about stream being hardcoded to the default stream
    at::cuda::CUDAEvent ev;
    ev.record(c10::cuda::getCurrentCUDAStream());
--- a/aten/src/ATen/cuda/CUDAGreenContext.h
+++ b/aten/src/ATen/cuda/CUDAGreenContext.h
@ -1,38 +1,53 @@
 #pragma once
 #include <ATen/cuda/CUDAEvent.h>
-#include <cuda.h>

-// Forward declare green context as opaque ptr
-typedef struct CUgreenCtx_st* CUgreenCtx;
+#if defined(CUDA_VERSION) && !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED)
+#include <c10/cuda/driver_api.h>
+#include <cuda.h>
+#include <memory>
+#include <stdexcept>
+#include <vector>
+#define CUDA_HAS_GREEN_CONTEXT 1
+#else
+#define CUDA_HAS_GREEN_CONTEXT 0
+#endif

 namespace at::cuda {

 class TORCH_CUDA_CPP_API GreenContext {
 public:
-  // Green context creation
-  static std::unique_ptr<GreenContext> create(
-      uint32_t num_sms,
-      std::optional<uint32_t> device_id);
-  ~GreenContext() noexcept;
+  GreenContext(uint32_t device_id, uint32_t num_sms);
+
+  static std::unique_ptr<GreenContext> create(uint32_t num_sms, std::optional<uint32_t> device_id);

  // Delete copy constructor and assignment
  GreenContext(const GreenContext&) = delete;
  GreenContext& operator=(const GreenContext&) = delete;

+  // Implement move operations
+  GreenContext(GreenContext&& other) noexcept;
+  GreenContext& operator=(GreenContext&& other) noexcept;
+  ~GreenContext() noexcept;
+
+  // Get the underlying CUDA context
+  CUcontext getContext() const;
+
+  // Get the underlying green context
+#if CUDA_HAS_GREEN_CONTEXT
+  CUgreenCtx getGreenContext() const;
+#endif
+
  // Make this context current
  void setContext();

  void popContext();

 private:
-  GreenContext(uint32_t device_id, uint32_t num_sms);
-  // Implement move operations
-  GreenContext(GreenContext&& other) noexcept;
-  GreenContext& operator=(GreenContext&& other) noexcept;
-
+#if CUDA_HAS_GREEN_CONTEXT
  int32_t device_id_ = -1;
  CUgreenCtx green_ctx_ = nullptr;
  CUcontext context_ = nullptr;
  cudaStream_t parent_stream_ = nullptr;
+#endif
 };
 } // namespace at::cuda
--- a/aten/src/ATen/cuda/CUDASparse.h
+++ b/aten/src/ATen/cuda/CUDASparse.h
@ -7,6 +7,17 @@
 #endif


+#if defined(USE_ROCM)
+// hipSparse const API added in v2.4.0. NOTE(review): both branches below define 1 - confirm the #else value.
+#if HIPSPARSE_VERSION >= 200400
+#define AT_USE_HIPSPARSE_GENERIC_API() 1
+#else
+#define AT_USE_HIPSPARSE_GENERIC_API() 1
+#endif
+#else // USE_ROCM
+#define AT_USE_HIPSPARSE_GENERIC_API() 0
+#endif // USE_ROCM
+
 // cuSparse Generic API spsv function was added in CUDA 11.3.0
 #if defined(CUDART_VERSION) && defined(CUSPARSE_VERSION) && (CUSPARSE_VERSION >= 11500)
 #define AT_USE_CUSPARSE_GENERIC_SPSV() 1
--- a/aten/src/ATen/cuda/Sleep.cu
+++ b/aten/src/ATen/cuda/Sleep.cu
@ -1,7 +1,6 @@
 #include <ATen/cuda/CUDAContextLight.h>
 #include <ATen/cuda/Sleep.h>

-#include <c10/cuda/CUDACachingAllocator.h>
 #include <c10/cuda/CUDAException.h>
 #include <c10/cuda/CUDAStream.h>

@ -25,22 +24,8 @@ __global__ void spin_kernel(int64_t cycles) {
 #endif
  }
 }
-
-thread_local int *flag = nullptr;
-
-__global__ void busy_wait_for_flag_kernel(int *flag) {
-  atomicExch(flag, 1);
-  while (atomicAdd(flag, 0) == 1) {
-    // do nothing
-  }
 }

-__global__ void clear_flag_kernel(int *flag) {
-  atomicExch(flag, 0);
-}
-
-} // anonymous namespace
-
 void sleep(int64_t cycles) {
  dim3 grid(1);
  dim3 block(1);
@ -48,26 +33,6 @@ void sleep(int64_t cycles) {
  C10_CUDA_KERNEL_LAUNCH_CHECK();
 }

-void busy_wait_for_flag() {
-  if (!flag) {
-    flag = (int*)c10::cuda::CUDACachingAllocator::raw_alloc(sizeof(int));
-  }
-  dim3 grid(1);
-  dim3 block(1);
-  busy_wait_for_flag_kernel<<<grid, block, 0, c10::cuda::getCurrentCUDAStream()>>>(flag);
-  C10_CUDA_KERNEL_LAUNCH_CHECK();
-}
-
-void clear_flag() {
-  if (!flag) {
-    flag = (int*)c10::cuda::CUDACachingAllocator::raw_alloc(sizeof(int));
-  }
-  dim3 grid(1);
-  dim3 block(1);
-  clear_flag_kernel<<<grid, block, 0, c10::cuda::getCurrentCUDAStream()>>>(flag);
-  C10_CUDA_KERNEL_LAUNCH_CHECK();
-}
-
 #ifdef USE_ROCM
 __global__ void flush_icache_kernel()
 {
--- a/aten/src/ATen/cuda/Sleep.h
+++ b/aten/src/ATen/cuda/Sleep.h
@ -7,11 +7,6 @@ namespace at::cuda {
 // enqueues a kernel that spins for the specified number of cycles
 TORCH_CUDA_CU_API void sleep(int64_t cycles);

-// enqueues a kernel that spins until a flag is cleared by a
-// corresponding call to clear_flag()
-TORCH_CUDA_CU_API void busy_wait_for_flag();
-TORCH_CUDA_CU_API void clear_flag();
-
 // flushes instruction cache for ROCm; no-op for CUDA
 TORCH_CUDA_CU_API void flush_icache();

--- a/aten/src/ATen/cuda/tunable/Tunable.cpp
+++ b/aten/src/ATen/cuda/tunable/Tunable.cpp
@ -580,7 +580,7 @@ std::ofstream& TuningContext::GetUntunedFile(){
      filename.append(device);
    }

-    untuned_file_ = std::ofstream(filename, std::ios::out | std::ios::app);
+    untuned_file_ = std::ofstream(filename, std::ios::out | std::ios::trunc);
  }
  return untuned_file_;
 }
--- a/aten/src/ATen/detail/MTIAHooksInterface.h
+++ b/aten/src/ATen/detail/MTIAHooksInterface.h
@ -1,6 +1,5 @@
 #pragma once

-#include <c10/core/CachingDeviceAllocator.h>
 #include <c10/core/Device.h>
 #include <c10/util/Exception.h>

@ -152,36 +151,6 @@ struct TORCH_API MTIAHooksInterface : AcceleratorHooksInterface {
  }

  virtual bool isAvailable() const override;
-
-  /* MTIAGraph related APIs */
-  virtual int64_t mtiagraphCreate(bool keep_graph = false) const {
-    FAIL_MTIAHOOKS_FUNC(__func__);
-    return -1;
-  }
-
-  virtual void mtiagraphCaptureBegin(int64_t handle, MempoolId_t pool) const {
-    FAIL_MTIAHOOKS_FUNC(__func__);
-  }
-
-  virtual void mtiagraphCaptureEnd(int64_t handle) const {
-    FAIL_MTIAHOOKS_FUNC(__func__);
-  }
-
-  virtual void mtiagraphInstantiate(int64_t handle) const {
-    FAIL_MTIAHOOKS_FUNC(__func__);
-  }
-
-  virtual void mtiagraphReplay(int64_t handle) const {
-    FAIL_MTIAHOOKS_FUNC(__func__);
-  }
-
-  virtual void mtiagraphReset(int64_t handle) const {
-    FAIL_MTIAHOOKS_FUNC(__func__);
-  }
-
-  virtual MempoolId_t mtiagraphPool(int64_t handle) const {
-    FAIL_MTIAHOOKS_FUNC(__func__);
-  }
 };

 struct TORCH_API MTIAHooksArgs {};
--- a/aten/src/ATen/native/Convolution.cpp
+++ b/aten/src/ATen/native/Convolution.cpp
@ -410,8 +410,8 @@ struct ConvParams {
      return false;
    }
    static long cudnn_version = detail::getCUDAHooks().versionCuDNN();
-    // broken on cuDNN 9.8 - 9.14
-    if (cudnn_version >= 90800 && cudnn_version < 91500) {
+    // broken on cuDNN 9.8 and every later version (the check has no upper bound)
+    if (cudnn_version >= 90800) {
      if (cudnn_conv_suggest_memory_format(input, weight) == at::MemoryFormat::Contiguous &&
          (input.scalar_type() == at::kBFloat16 || input.scalar_type() == at::kHalf) &&
          weight.dim() == 5) {
@ -689,10 +689,6 @@ static void check_shape_forward(const at::Tensor& input,
             ", but got bias of size ", at::symint::sizes<T>(bias), " instead");

    for (const auto i : c10::irange(2, k)) {
-      // T could be int64_t or SymInt, Specialized numeric_limts<SymInt> in c10/core/SymInt.h
-      TORCH_CHECK(padding[i-2] <= (std::numeric_limits<T>::max() - padding[i-2]),
-                  "Given padding=", padding[i-2], " at dimension ", i-2, " , expected padding to be at most ",
-                  (std::numeric_limits<T>::max() / 2));
      input_shape.push_back(at::symint::size<T>(input, i) + 2 * padding[i-2]);
      // log new kernel size considering dilation
      kernel_shape.push_back(dilation[i-2] * (weight_sizes[i]-1) + 1);
@ -719,11 +715,6 @@ static void check_shape_forward(const at::Tensor& input,
               "Kernel size: (", kernel_ss.str(), "). Kernel size can't be greater than actual input size");
    }
  } else { // transposed
-    for (const auto i : c10::irange(2, k)) {
-      TORCH_CHECK(padding[i-2] <= (std::numeric_limits<T>::max() - padding[i-2]),
-                  "Given padding=", padding[i-2], " at dimension ", i-2, " , expected padding to be at most ",
-                  (std::numeric_limits<T>::max() / 2));
-    }
    TORCH_CHECK(at::symint::size<T>(input, 1) == weight_sizes[0],
             "Given transposed=", transposed, ", weight of size ", weight_sizes,
             ", expected input", at::symint::sizes<T>(input), " to have ", weight_sizes[0],
--- a/aten/src/ATen/native/ConvolutionTBC.cpp
+++ b/aten/src/ATen/native/ConvolutionTBC.cpp
@ -52,7 +52,8 @@ Tensor conv_tbc(const Tensor& self, const Tensor& weight, const Tensor& bias, in
  for (const auto k : c10::irange(kw)) {
    int iShift = std::max(0, static_cast<int>(k - real_pad));
    int oShift = std::max(0, static_cast<int>(real_pad - k));
-    long t = std::min(ilen + real_pad - k, olen) - oShift;
+    // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
+    int t = std::min(ilen + real_pad - k, olen) - oShift;
    // Note: gemm assumes column-major matrices
    // input    is l*m (row-major)
    // weight   is m*r (row-major)
--- a/aten/src/ATen/native/IndexingUtils.cpp
+++ b/aten/src/ATen/native/IndexingUtils.cpp
@ -16,7 +16,8 @@ bool canUse32BitIndexMath(const TensorBase& t, int64_t max_elem) {
  auto linearId = elements - 1;

  // NOTE: Assumes all strides are positive, which is true for now
-  for (auto i = t.dim() - 1; i >= 0; --i) {
+  // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
+  for (int i = t.dim() - 1; i >= 0; --i) {
    auto curDimIndex = linearId % t.sym_size(i);
    auto curDimOffset = curDimIndex * t.sym_stride(i);
    offset += curDimOffset;
--- a/aten/src/ATen/native/QuantizedLinear.cpp
+++ b/aten/src/ATen/native/QuantizedLinear.cpp
@ -68,6 +68,7 @@ Tensor fbgemm_linear_int8_weight_fp32_activation(
  const float* input_ptr = input_contig.const_data_ptr<float>();

  TORCH_CHECK(input.dim() >= 2);
+  // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
  const int64_t M = size_to_dim_(input.dim() - 1, input.sizes());
  const int64_t K = input.size(input.dim() - 1);
  TORCH_CHECK(weight.dim() == 2);
--- a/aten/src/ATen/native/cpu/DistanceOpsKernel.cpp
+++ b/aten/src/ATen/native/cpu/DistanceOpsKernel.cpp
@ -160,9 +160,10 @@ struct Dist {
    // value of k.
    parallel_for(0, combs, internal::GRAIN_SIZE / (16 * m), [p, self_start, self_end, n, m, res_start](int64_t k, int64_t end) {
      const Vec pvec(p);
-      double n2 = static_cast<double>(n) - .5;
+      double n2 = n - .5;
      // The -1 accounts for floating point truncation issues
-      int64_t i = static_cast<int64_t>((n2 - std::sqrt(n2 * n2 - 2.0 * static_cast<double>(k) - 1.0)));
+      // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
+      int64_t i = static_cast<int64_t>((n2 - std::sqrt(n2 * n2 - 2 * k - 1)));
      int64_t j = k - n * i + i * (i + 1) / 2 + i + 1;

      const scalar_t * self_i = self_start + i * m;
--- a/aten/src/ATen/native/cuda/Blas.cpp
+++ b/aten/src/ATen/native/cuda/Blas.cpp
@ -170,14 +170,10 @@ static bool isInputCompliesAddmmCudaLt(Tensor& result, const Tensor& self, const
  #if defined(CUDA_VERSION) || defined(USE_ROCM)
  const auto scalar_type = mat1.scalar_type();
  return (beta.toComplexDouble() == 1.0
+    // Use the Lt interface only when self is a bias vector: self must be 1-D,
+    // contiguous and of length mat2_sizes[1]; result must be 2-D and contiguous.
+    && self.dim() == 1 && self.sizes()[0] == mat2_sizes[1] && self.is_contiguous()
    && result.dim() == 2 && result.is_contiguous()
-    // Conditions for bias to be fusable
-    && (
-      self.is_contiguous() &&
-      // NOTE: fine to have 1-len dims to the left from the right-most one
-      (self.dim() == 1 || self.squeeze().dim() == 1) &&
-      self.sizes().back() == mat2_sizes[1]
-    )
    && ( // some dtype restrictions
      #ifndef USE_ROCM
      scalar_type == at::ScalarType::Double ||
--- a/aten/src/ATen/native/cuda/GroupedBlas.cpp
+++ b/aten/src/ATen/native/cuda/GroupedBlas.cpp
@ -208,62 +208,6 @@ _f8_f8_bf16_rowwise_grouped_mm(
 #endif
 }

-Tensor&
-_f4_f4_bf16_grouped_mm_fbgemm(
-      const Tensor& mat_a,
-      const Tensor& mat_b,
-      const Tensor& scale_a,
-      const std::optional<Tensor>& global_scale_a,
-      const Tensor& scale_b,
-      const std::optional<Tensor>& global_scale_b,
-      const std::optional<Tensor>& offs,
-      const std::optional<Tensor>& bias,
-      Tensor& out) {
-#if !defined(USE_ROCM) && defined(USE_FBGEMM_GENAI)
-  // Typing checks
-  TORCH_CHECK_VALUE(mat_a.scalar_type() == at::kFloat4_e2m1fn_x2,
-      "mat_a must be Float4_e2n1fn_2, got: ", mat_a.scalar_type());
-  TORCH_CHECK_VALUE(mat_b.scalar_type() == at::kFloat4_e2m1fn_x2,
-      "mat_b must be Float4_e2n1fn_2, got: ", mat_b.scalar_type());
-
-  std::optional<Tensor> combined_global_scale = std::nullopt;
-  if (global_scale_a.has_value() || global_scale_b.has_value()) {
-      // NVFP4
-      TORCH_CHECK_VALUE(global_scale_a.has_value() && global_scale_b.has_value(),
-          "For NVFP4 grouped gemm both of global_scale_{a,b} must have values")
-      TORCH_CHECK_VALUE(scale_a.scalar_type() == at::kFloat8_e4m3fn,
-          "scale_a must be Float8_e4m3fn, got: ", scale_a.scalar_type());
-      TORCH_CHECK_VALUE(scale_b.scalar_type() == at::kFloat8_e4m3fn,
-          "scale_b must be Float8_e4m3fn, got: ", scale_b.scalar_type());
-      TORCH_CHECK_VALUE(global_scale_a.value().scalar_type() == at::kFloat,
-          "global_scale_a must be Float, got: ", global_scale_a.value().scalar_type());
-      TORCH_CHECK_VALUE(global_scale_b.value().scalar_type() == at::kFloat,
-          "global_scale_b must be Float, got: ", global_scale_b.value().scalar_type());
-      combined_global_scale = global_scale_a.value().mul(global_scale_b.value());
-  } else {
-      // MXFP4
-      TORCH_CHECK_VALUE(scale_a.scalar_type() == at::kFloat8_e8m0fnu,
-          "scale_a must be Float8_e8m0fnu, got: ", scale_a.scalar_type());
-      TORCH_CHECK_VALUE(scale_b.scalar_type() == at::kFloat8_e8m0fnu,
-          "scale_b must be Float8_e8m0fnu, got: ", scale_b.scalar_type());
-  }
-
-  auto o = fbgemm_gpu::f4f4bf16_grouped_mm(
-      mat_a,
-      mat_b,
-      scale_a,
-      scale_b,
-      offs.value(),
-      out,
-      combined_global_scale
-  );
-#else
-  TORCH_CHECK_NOT_IMPLEMENTED(false, "nvfp4 grouped gemm is not supported without USE_FBGEMM_GENAI, and only for CUDA")
-#endif
-
-  return out;
-}
-
 void _check_scales_fp8_rowwise(const Tensor& mat, const Tensor& scale, const int dim, const int arg_idx, const int scale_multiplier=1) {
  // Checks scales for 2d or 3d target tensors (`mat`).
  if (mat.dim() == 2) {
@ -301,15 +245,7 @@ void _check_scales_fp8_rowwise(const Tensor& mat, const Tensor& scale, const int
  }
 }

-void _check_scales_blocked(const Tensor& mat, const Tensor& scale, const int dim, const int arg_idx) {
-  // if {mx,nv}fp4, will need to modify K later
-  bool is_fp4 = (mat.scalar_type() == kFloat4_e2m1fn_x2);
-  int blocksize = 32;
-  // check for nvfp4 vs. mxfp4 to fix blocksize
-  if (is_fp4 && scale.scalar_type() == kFloat8_e4m3fn) {
-    blocksize = 16;
-  }
-
+void _check_scales_mxfp8(const Tensor& mat, const Tensor& scale, const int dim, const int arg_idx) {
  // Checks scales for 2d or 3d target tensors (`mat`).
  if (mat.dim() == 2) {
    // For MXFP8, 2d tensors have variable size groups represented as subtensors,
@ -317,19 +253,17 @@ void _check_scales_blocked(const Tensor& mat, const Tensor& scale, const int dim
    // so we can't check the scale sizes without doing a d2h sync to get the group sizes here.
    TORCH_CHECK(
      scale.dim() == mat.dim(),
-      "for block-scaled, scale must have same number of dimensions as parent tensor, but got mat.dim() = ", mat.dim(),
-      " and scale.dim() = ", scale.dim(), " for arg ", arg_idx
-    );
+      "for mxfp8, scale must have same number of dimensions as parent tensor, but got mat.dim() = ", mat.dim(), " and scale.dim() = ", scale.dim(), " for arg ", arg_idx);

-    // LHS mat shape (M, total_K) -> scale shape (rounded_up(M, 128), rounded_up_per_group(K/blocksize, 4))
-    // RHS mat shape (total_K, N) -> scale shape (rounded_up(N, 128), rounded_up_per_group(K/blocksize, 4))
+    // LHS mat shape (M, total_K) -> scale shape (rounded_up(M, 128), rounded_up_per_group(K/32, 4))
+    // RHS mat shape (total_K, N) -> scale shape (rounded_up(N, 128), rounded_up_per_group(K/32, 4))
    //   * weight is transposed prior to the call, scale stays non-transposed.
    bool LHS = arg_idx == 0;
    int scale_dim_to_check = 0;
    int mat_dim_to_check = LHS ? 0 : 1;
    TORCH_CHECK(
        scale.size(scale_dim_to_check) >= mat.size(mat_dim_to_check),
-        "for block-scaled, arg ", arg_idx, " tensor shape (", mat.size(0), ", ", mat.size(1), ") ",
+        "for mxfp8, arg ", arg_idx, " tensor shape (", mat.size(0), ", ", mat.size(1), ") ",
        "must have scale.shape[", scale_dim_to_check, "] >= ", mat.size(mat_dim_to_check), " but got scale.shape=(", scale.size(0), ", ", scale.size(1), ")");
  } else {
    // For MXFP8, 3d tensors have static group sizes (stack of 2d tensors),
@ -339,40 +273,32 @@ void _check_scales_blocked(const Tensor& mat, const Tensor& scale, const int dim
    };

    // TODO: this is for 3d tensor in 2d-3d case specifically.
-    // We'll need to support 3d-3d and 3d-2d cases once mxfp8/nvfp4 grouped gemm supports them.
+    // We'll need to support 3d-3d and 3d-2d cases once mxfp8 grouped gemm supports them.
    int64_t G = mat.size(0);
    int64_t K = mat.size(1);
-    if (is_fp4) {
-      // FP4 packs 2 values into a single 8b word - the "real" K is 2x the
-      // reported K. Reverse that adjustment.
-      const int fp4_elems_per_byte = 2;
-      K *= fp4_elems_per_byte;
-    }
    int64_t N = mat.size(2);
-    int64_t blocked_scale_K = round_up(K/blocksize, 4);
+    int64_t blocked_scale_K = round_up(K/32, 4);
    int64_t blocked_scale_N = round_up(N, 128);

    // fbgemm expects stack of flattened blocked scales for 3d tensor, shape (G, blocked_scale_K * blocked_scale_N).
    TORCH_CHECK(
      scale.dim() == mat.dim() - 1,
-      "for block-scaled 2d-3d grouped GEMM, the 3d tensor of shape (G,K,N) must have a 2d scale of shape (G, blocked_scale_K * blocked_scale_N),",
-      "but scale is ", scale.dim(), "D for arg ", arg_idx
+      "for mxfp8 2d-3d grouped GEMM, the 3d tensor of shape (G,K,N) must have a 2d scale of shape (G, blocked_scale_K * blocked_scale_N), but scale is ", scale.dim(), "D for arg ", arg_idx
    );
    TORCH_CHECK(
      scale.size(0) == G && scale.size(1) == blocked_scale_K * blocked_scale_N,
-      "for block-scaled grouped GEMM, the tensor shape (", G, ", ", K, ", ", N, ") must have scale shape (", G, ",", blocked_scale_K, ",", blocked_scale_N, ")",
-      " for arg ", arg_idx, ", got: ", scale.size(0), ", ", scale.size(1)
+      "for mxfp8, the tensor shape (", G, ", ", K, ", ", N, ") must have scale shape (", G, ",", blocked_scale_K, ",", blocked_scale_N, ") for arg ", arg_idx
    );
  }
 }

 void check_scale(const Tensor& mat, const Tensor& scale, const int dim, const int arg_idx, const int scale_multiplier=1) {
  bool using_fp8_rowwise = scale.scalar_type() == kFloat;
-  bool using_mx = scale.scalar_type() == at::kFloat8_e8m0fnu;
+  bool using_mxfp8 = scale.scalar_type() == at::kFloat8_e8m0fnu;
  if (using_fp8_rowwise) {
    _check_scales_fp8_rowwise(mat, scale, dim, arg_idx, scale_multiplier);
-  } else if (using_mx) {
-    _check_scales_blocked(mat, scale, dim, arg_idx);
+  } else if (using_mxfp8) {
+    _check_scales_mxfp8(mat, scale, dim, arg_idx);
  } else {
    TORCH_CHECK(false, "scale must be float32 or float8_e8m0fnu, but got ", scale.dtype());
  }
@ -485,11 +411,9 @@ namespace {

 using acceptance_fn = std::function<bool(c10::ScalarType, std::vector<ScalingType>&, ArrayRef<Tensor>&, c10::ScalarType, std::vector<ScalingType>&, ArrayRef<Tensor>&)>;

-std::array<std::tuple<std::string, acceptance_fn, ScaledGemmImplementation>, 4> scale_grouped_kernel_dispatch = {{
+std::array<std::tuple<std::string, acceptance_fn, ScaledGemmImplementation>, 2> scale_grouped_kernel_dispatch = {{
  { "rowwise_rowwise", scaled_blas::check_rowwise_recipe, ScaledGemmImplementation::ROWWISE_ROWWISE},
-  { "mxfp8_mxfp8", scaled_blas::check_mxfp8_recipe, ScaledGemmImplementation::MXFP8_MXFP8},
-  { "mxfp4_mxfp4", scaled_blas::check_mxfp4_recipe, ScaledGemmImplementation::MXFP4_MXFP4},
-  { "nvfp4_nvfp4", scaled_blas::check_nvfp4_recipe, ScaledGemmImplementation::NVFP4_NVFP4}}};
+  { "mxfp8_mxfp8", scaled_blas::check_mxfp8_recipe, ScaledGemmImplementation::MXFP8_MXFP8}}};

 } // anonymous namespace

@ -601,9 +525,8 @@ _scaled_grouped_mm_cuda_v2(
          out);
    }
    case ScaledGemmImplementation::MXFP8_MXFP8: {
-      // scale shape checks
-      _check_scales_blocked(mat_a, scale_a[0], 0 /* dim */, 0 /* arg_idx */);
-      _check_scales_blocked(mat_b, scale_b[0], 1 /* dim */, 1 /* arg_idx */);
+      _check_scales_mxfp8(mat_a, scale_a[0], 0 /* dim */, 0 /* arg_idx */);
+      _check_scales_mxfp8(mat_b, scale_b[0], 1 /* dim */, 1 /* arg_idx */);
      return _mx8_mx8_bf16_grouped_mm_fbgemm(
          mat_a,
          mat_b,
@ -614,36 +537,6 @@ _scaled_grouped_mm_cuda_v2(
          offs.value(),
          out);
    }
-    case ScaledGemmImplementation::MXFP4_MXFP4: {
-      // scale shape checks
-      _check_scales_blocked(mat_a, scale_a[0], 0 /* dim */, 0 /* arg_idx */);
-      _check_scales_blocked(mat_b, scale_b[0], 1 /* dim */, 1 /* arg_idx */);
-      return _f4_f4_bf16_grouped_mm_fbgemm(
-          mat_a,
-          mat_b,
-          scale_a[0], /* block-scale A */
-          std::nullopt, /* global-scale A */
-          scale_b[0], /* block-scale B */
-          std::nullopt, /* global-scale B */
-          offs.value(),
-          std::nullopt, /* bias */
-          out);
-    }
-    case ScaledGemmImplementation::NVFP4_NVFP4: {
-      // scale shape checks
-      _check_scales_blocked(mat_a, scale_a[0], 0 /* dim */, 0 /* arg_idx */);
-      _check_scales_blocked(mat_b, scale_b[0], 1 /* dim */, 1 /* arg_idx */);
-      return _f4_f4_bf16_grouped_mm_fbgemm(
-          mat_a,
-          mat_b,
-          scale_a[0], /* block-scale A */
-          scale_a[1], /* global-scale A */
-          scale_b[0], /* block-scale B */
-          scale_b[1], /* global-scale B */
-          offs.value(),
-          std::nullopt, /* bias */
-          out);
-    }
    default:
      TORCH_CHECK_NOT_IMPLEMENTED(false,
          "_scaled_grouped_mm_cuda_v2 is in an inconsistent state - should never reach here");
--- a/aten/src/ATen/native/cuda/IndexKernelUtils.cu
+++ b/aten/src/ATen/native/cuda/IndexKernelUtils.cu
@ -13,7 +13,7 @@ __global__ void vectorized_gather_kernel(char * out, char * inp, index_t * idx,
    if (allow_neg_indices) {
        ind = (ind < 0) ? ind + ind_dim_size : ind;
    }
-    CUDA_KERNEL_ASSERT_VERBOSE(ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds", "Expected 0 <= index < ind_dim_size(%ld), but got index = %ld", ind_dim_size, ind);
+    CUDA_KERNEL_ASSERT(ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds");
    int32_t off = (blockDim.x * blockIdx.y + threadIdx.x) * Alignment; // off is guaranteed to be within int32 limits
    if (off >= slice_size) return;
    auto vec = at::native::memory::ld_vec<Alignment>(inp + ind * inp_stride + off);
--- a/aten/src/ATen/native/cuda/RowwiseScaledMM.cu
+++ b/aten/src/ATen/native/cuda/RowwiseScaledMM.cu
@ -54,6 +54,7 @@ namespace {
 using DtypeScale = float;
 using DtypeAccum = float;
 using DtypeEpilogue = float;
+using DtypeOutput = cutlass::bfloat16_t;

 using Multiply = cutlass::epilogue::fusion::Sm90Compute<
    cutlass::multiplies,
@ -67,6 +68,12 @@ using Add = cutlass::epilogue::fusion::Sm90Compute<
    DtypeEpilogue,
    cutlass::FloatRoundStyle::round_to_nearest>;

+using Cast = cutlass::epilogue::fusion::Sm90Compute<
+    cutlass::epilogue::thread::Identity,
+    DtypeOutput,
+    DtypeEpilogue,
+    cutlass::FloatRoundStyle::round_to_nearest>;
+
 template <bool LargeTile, bool FastAccum>
 struct Schedule;

@ -113,8 +120,7 @@ template <
    typename FastAccum,
    typename DtypeA,
    typename DtypeB,
-    typename DtypeBias,
-    typename DtypeOutput>
+    typename DtypeBias>
 void f8f8bf16_rowwise_impl(
    at::Tensor XQ, // FP8
    at::Tensor WQ, // FP8
@ -175,11 +181,6 @@ void f8f8bf16_rowwise_impl(
      WScale,
      cutlass::epilogue::fusion::Sm90EVT<Multiply, XScale, Accum>>;

-  using Cast = cutlass::epilogue::fusion::Sm90Compute<
-      cutlass::epilogue::thread::Identity,
-      DtypeOutput,
-      DtypeEpilogue,
-      cutlass::FloatRoundStyle::round_to_nearest>;
  using EpilogueEVT = cutlass::epilogue::fusion::Sm90EVT<
      Cast,
      cutlass::epilogue::fusion::Sm90EVT<
@ -312,8 +313,7 @@ template <
    typename FastAccum,
    typename DtypeA,
    typename DtypeB,
-    typename DtypeBias,
-    typename DtypeOutput>
+    typename DtypeBias>
 void f8f8bf16_rowwise_impl_sm100_sm120(
    at::Tensor XQ, // FP8
    at::Tensor WQ, // FP8
@ -372,11 +372,6 @@ void f8f8bf16_rowwise_impl_sm100_sm120(
      WScale,
      cutlass::epilogue::fusion::Sm90EVT<Multiply, XScale, Accum>>;

-  using Cast = cutlass::epilogue::fusion::Sm90Compute<
-      cutlass::epilogue::thread::Identity,
-      DtypeOutput,
-      DtypeEpilogue,
-      cutlass::FloatRoundStyle::round_to_nearest>;
  using EpilogueEVT = cutlass::epilogue::fusion::Sm90EVT<
      Cast,
      cutlass::epilogue::fusion::Sm90EVT<
@ -503,8 +498,7 @@ template <
    typename FastAccum,
    typename DtypeA,
    typename DtypeB,
-    typename DtypeBias,
-    typename DtypeOutput>
+    typename DtypeBias>
 void f8f8bf16_rowwise_impl_sm89(
    at::Tensor XQ, // FP8
    at::Tensor WQ, // FP8
@ -771,8 +765,7 @@ template <
    typename FastAccum,
    typename DtypeA,
    typename DtypeB,
-    typename DtypeBias,
-    typename DtypeOutput>
+    typename DtypeBias>
 void handle_transposition(
    at::Tensor XQ,
    at::Tensor WQ,
@ -789,8 +782,7 @@ void handle_transposition(
        FastAccum,
        DtypeA,
        DtypeB,
-        DtypeBias,
-        DtypeOutput>(XQ, WQ, x_scale, w_scale, bias, out, swizzle);
+        DtypeBias>(XQ, WQ, x_scale, w_scale, bias, out, swizzle);
  } else {
    dispatch_fp8_rowwise_kernel_on_tile_size<
        ClusterShape,
@ -799,8 +791,7 @@ void handle_transposition(
        FastAccum,
        DtypeB,
        DtypeA,
-        DtypeBias,
-        DtypeOutput>(WQ.t(), XQ.t(), w_scale.t(), x_scale.t(), bias, out.t(), swizzle);
+        DtypeBias>(WQ.t(), XQ.t(), w_scale.t(), x_scale.t(), bias, out.t(), swizzle);
  }
 }

@ -1036,19 +1027,11 @@ void dispatch_fp8_rowwise_kernel_on_bias_dtype(
    at::Tensor out) {
  if (bias.has_value() && bias->dtype() == at::kBFloat16) {
    dispatch_fp8_rowwise_kernel_on_input_dtypes<
-        cutlass::bfloat16_t,
        cutlass::bfloat16_t>
        (XQ, WQ, x_scale, w_scale, bias, use_fast_accum, out);
-  } else if (bias.has_value() && bias->dtype() == at::kHalf){
-    TORCH_CHECK(out.dtype() == at::kHalf, "Output should be Float16 when bias is Float16");
-    dispatch_fp8_rowwise_kernel_on_input_dtypes<
-        cutlass::half_t,
-        cutlass::half_t>
-        (XQ, WQ, x_scale, w_scale, bias, use_fast_accum, out);
  } else {
    dispatch_fp8_rowwise_kernel_on_input_dtypes<
-        float,
-        cutlass::bfloat16_t>
+        float>
        //Types...>
        (XQ, WQ, x_scale, w_scale, bias, use_fast_accum, out);
  }
@ -1090,14 +1073,14 @@ void check_inputs(

  if (bias.has_value()) {
    TORCH_CHECK(bias->device() == b.device());
-    TORCH_CHECK(bias->dtype() == at::kFloat || bias->dtype() == at::kBFloat16 || bias->dtype() == at::kHalf);
+    TORCH_CHECK(bias->dtype() == at::kFloat || bias->dtype() == at::kBFloat16);
    TORCH_CHECK(bias->dim() == 1);
    TORCH_CHECK(bias->size(0) == b.size(1));
    TORCH_CHECK(bias->stride(0) == 1);
  }

  TORCH_CHECK(out.device() == a.device());
-  TORCH_CHECK(out.dtype() == at::kBFloat16 || out.dtype() == at::kHalf);
+  TORCH_CHECK(out.dtype() == at::kBFloat16);
  TORCH_CHECK(out.dim() == 2);
  TORCH_CHECK(out.size(0) == a.size(0));
  TORCH_CHECK(out.size(1) == b.size(1));
--- a/aten/src/ATen/native/cuda/ScaledBlas.cpp
+++ b/aten/src/ATen/native/cuda/ScaledBlas.cpp
@ -59,22 +59,6 @@
 // forward declare
 class cublasCommonArgs;

-namespace fbgemm_gpu {
-
-// NOTE(slayton58): FBGemm_GPU kernels come from <fbgemm_gpu/torch_ops.h> within the FBGemm repo.
-//                  To update supported ops means a submodule bump, which is.. painful. Instead, we
-//                  can simply forward-declare the methods we want to use.. Works at least as a short-term
-//                  thing, but should still be fixed somewhere/somehow.
-at::Tensor f4f4bf16(
-    at::Tensor,
-    at::Tensor,
-    at::Tensor,
-    at::Tensor,
-    std::optional<at::Tensor>,
-    bool use_mx);
-
-} // namespace fbgemm_gpu
-
 using at::blas::ScalingType;
 using at::blas::SwizzleType;

@ -607,7 +591,7 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2,
    if ((dprops->major < 9 || CUBLAS_VERSION < 120900 || cublasLtGetVersion() < 120900)
        // cuBLAS only supports tiled 1D factor layout for 1D block scaling, no 2D block scales
        ||  (dprops->major >= 10 && (!scale_a.sizes().empty() || !scale_b.sizes().empty()))) {
-      TORCH_CHECK_VALUE(out.dtype() == kBFloat16 || out.dtype() == kHalf, "Only bf16 and fp16 high precision output types are supported for row-wise scaling.");
+      TORCH_CHECK_VALUE(out.dtype() == kBFloat16, "Only bf16 high precision output types are supported for row-wise scaling.");
      return _scaled_rowwise_rowwise(
          mat1,
          mat2,
@ -752,7 +736,7 @@ _scaled_rowwise_rowwise(
  if (((dprops->major < 9 || CUBLAS_VERSION < 120900 || cublasLtGetVersion() < 120900)
      // cuBLAS only supports tiled 1D factor layout for 1D block scaling, no 2D block scales
      ||  (dprops->major == 10 && (scale_a.sizes().size() || scale_b.sizes().size())))) {
-    TORCH_CHECK_VALUE(out.dtype() == kBFloat16 || out.dtype() == kHalf, "Only bf16 and fp16 high precision output types are supported for row-wise scaling.");
+    TORCH_CHECK_VALUE(out.dtype() == kBFloat16, "Only bf16 high precision output types are supported for row-wise scaling.");
    at::cuda::detail::f8f8bf16_rowwise(
        mat_a,
        mat_b,
@ -810,24 +794,6 @@ void _check_deepseek_scale_stride(const Tensor& scale, const Tensor& t, const Sc
  }
 }

-void
-_check_deepseek_support() {
-#ifndef USE_ROCM
-  auto dprops = at::cuda::getCurrentDeviceProperties();
-  if (dprops->major != 9) {
-    // Only on Hopper GPUs
-    TORCH_CHECK_NOT_IMPLEMENTED(
-      dprops->major == 9,
-      "DeepSeek style (1x128, 128x128) scaling only supported in CUDA for SM90")
-  }
-  // Only in cublasLt >= 12.9
-  TORCH_CHECK_NOT_IMPLEMENTED(
-    CUBLAS_VERSION < 120900 || cublasLtGetVersion() < 120900,
-    "DeepSeek style (1x128, 128x128) scaling requires cublasLt >= 12.9"
-  );
-#endif
-}
-
 Tensor&
 _scaled_block1x128_block1x128(
          const Tensor& mat_a, const Tensor& mat_b,
@ -836,12 +802,8 @@ _scaled_block1x128_block1x128(
          const c10::ScalarType out_dtype,
          const bool use_fast_accum,
          Tensor& out) {
-#ifndef USE_ROCM
  // Restrictions:
  // A, B are FP8, scales are fp32, shape K//128
-  // CUDA: Only Hopper GPUs
-  _check_deepseek_support();
-
  TORCH_CHECK_VALUE(isFloat8Type(mat_a.scalar_type()) && isFloat8Type(mat_b.scalar_type()), "mat_a and mat_b must be fp8 types, got: ",
      mat_a.scalar_type(), mat_b.scalar_type());
  TORCH_CHECK_VALUE(scale_a.sizes()[0] == mat_a.sizes()[0] && scale_a.sizes()[1] == mat_a.sizes()[1] / 128 && scale_a.scalar_type() == kFloat,
@ -859,12 +821,6 @@ _scaled_block1x128_block1x128(
  _scaled_gemm(mat_a, mat_b, scale_a, scale_b, scaling_choice_a, scaling_choice_b, bias, use_fast_accum, out);

  return out;
-#else
-  TORCH_CHECK_NOT_IMPLEMENTED(
-    false,
-    "1x128 and 128x128 scaling not available with ROCm"
-  );
-#endif
 }

 Tensor&
@ -875,12 +831,8 @ _scaled_block128x128_block1x128(
          const c10::ScalarType out_dtype,
          const bool use_fast_accum,
          Tensor& out) {
-#ifndef USE_ROCM
  // Restrictions:
  // A, B are FP8, scales are fp32, shape K//128
-  // CUDA: Only Hopper GPUs
-  _check_deepseek_support();
-
  TORCH_CHECK_VALUE(isFloat8Type(mat_a.scalar_type()) && isFloat8Type(mat_b.scalar_type()), "mat_a and mat_b must be fp8 types, got: ",
      mat_a.scalar_type(), mat_b.scalar_type());
  TORCH_CHECK_VALUE(scale_a.sizes()[0] == ceil_div<int64_t>(mat_a.sizes()[0], 128) && scale_a.sizes()[1] == ceil_div<int64_t>(mat_a.sizes()[1], 128) && scale_a.scalar_type() == kFloat,
@ -898,12 +852,6 @@ _scaled_block128x128_block1x128(
  _scaled_gemm(mat_a, mat_b, scale_a, scale_b, scaling_choice_a, scaling_choice_b, bias, use_fast_accum, out);

  return out;
-#else
-  TORCH_CHECK_NOT_IMPLEMENTED(
-    false,
-    "1x128 and 128x128 scaling not available with ROCm"
-  );
-#endif
 }

 Tensor&
@ -914,12 +862,8 @@ _scaled_block1x128_block128x128(
          const c10::ScalarType out_dtype,
          const bool use_fast_accum,
          Tensor& out) {
-#ifndef USE_ROCM
  // Restrictions:
  // A, B are FP8, scales are fp32, A: shape K//128, B: K//128, N//128
-  // CUDA: Only Hopper GPUs
-  _check_deepseek_support();
-
  TORCH_CHECK_VALUE(isFloat8Type(mat_a.scalar_type()) && isFloat8Type(mat_b.scalar_type()), "mat_a and mat_b must be fp8 types, got: ",
      mat_a.scalar_type(), mat_b.scalar_type());
  TORCH_CHECK_VALUE(scale_a.sizes()[0] == mat_a.sizes()[0] && scale_a.sizes()[1] == mat_a.sizes()[1] / 128 && scale_a.scalar_type() == kFloat,
@ -937,12 +881,6 @@ _scaled_block1x128_block128x128(
  _scaled_gemm(mat_a, mat_b, scale_a, scale_b, scaling_choice_a, scaling_choice_b, bias, use_fast_accum, out);

  return out;
-#else
-  TORCH_CHECK_NOT_IMPLEMENTED(
-    false,
-    "1x128 and 128x128 scaling not available with ROCm"
-  );
-#endif
 }

 Tensor&
@ -1013,47 +951,26 @@ _scaled_mxfp4_mxfp4(
          const std::optional<Tensor>& bias,
          const c10::ScalarType out_dtype,
          Tensor& out) {
-#if !defined(USE_ROCM) && !defined(USE_FBGEMM_GENAI)
-  TORCH_CHECK_NOT_IMPLEMENTED(false, "MXFP4 scaling supported on ROCM and CUDA+FBGEMM_GENAI only");
+#ifndef USE_ROCM
+  TORCH_CHECK_NOT_IMPLEMENTED(false, "MXFP4 scaling supported on ROCM only");
 #endif
  // Restrictions:
  // A, B are FP4, scales are e8m0, A: shape K//32, B: K, N//32
  TORCH_CHECK_VALUE(mat_a.scalar_type() == at::kFloat4_e2m1fn_x2 && mat_b.scalar_type() == at::kFloat4_e2m1fn_x2, "mat_a and mat_b must be fp4 types, got: ",
      mat_a.scalar_type(), mat_b.scalar_type());

-  // Packed FP4 format means actual-K = 2 * reported-K -- adjust
-  auto K_multiplier = 2;
-#ifdef USE_ROCM
-  // AMD
-  auto scale_a_elems = ceil_div<int64_t>(K_multiplier * mat_a.size(0), 32) * mat_a.size(1);
-  auto scale_b_elems = ceil_div<int64_t>(K_multiplier * mat_b.size(1), 32) * mat_b.size(0);
-#else
-  // NVIDIA
-  auto scale_a_elems = round_up<int64_t>(mat_a.size(0), 128) * round_up<int64_t>(ceil_div<int64_t>(K_multiplier * mat_a.size(1), 32), 4);
-  auto scale_b_elems = round_up<int64_t>(mat_b.size(1), 128) * round_up<int64_t>(ceil_div<int64_t>(K_multiplier * mat_b.size(0), 32), 4);
-#endif
+  auto scale_a_elems = ceil_div<int64_t>(2 * mat_a.size(0), 32) * mat_a.size(1);
+  auto scale_b_elems = ceil_div<int64_t>(2 * mat_b.size(1), 32) * mat_b.size(0);
  TORCH_CHECK_VALUE(scale_a_elems == scale_a.numel(),
         "For Blockwise scaling scale_a should have ", scale_a_elems, " elements, got: ", scale_a.numel());
  TORCH_CHECK_VALUE(scale_b_elems == scale_b.numel(),
         "For Blockwise scaling scale_b should have ", scale_b_elems, " elements, got: ", scale_b.numel());

-#ifdef USE_ROCM
-  // AMD
-  TORCH_CHECK_VALUE(swizzle_a == SwizzleType::NO_SWIZZLE, "scale_a must not be swizzled (NO_SWIZZLE format)");
-  TORCH_CHECK_VALUE(swizzle_b == SwizzleType::NO_SWIZZLE, "scale_b must not be swizzled (NO_SWIZZLE format)");
-#else
-  // NVIDIA
-  TORCH_CHECK_VALUE(swizzle_a == SwizzleType::SWIZZLE_32_4_4, "scale_a must be swizzled to SWIZZLE_32_4_4 format");
-  TORCH_CHECK_VALUE(swizzle_b == SwizzleType::SWIZZLE_32_4_4, "scale_b must be swizzled to SWIZZLE_32_4_4 format");
-#endif
-
  TORCH_CHECK_VALUE(scale_a.is_contiguous() && scale_b.is_contiguous(),
        "For Blockwise scaling both scales should be contiguous");

  TORCH_CHECK_VALUE(out.scalar_type() == out_dtype, "expected out.scalar_type() to be ", out_dtype, ", but got ", out_dtype);

-#ifdef USE_ROCM
-  // AMD
  auto scaling_choice_a = ScalingType::BlockWise1x32;
  auto scaling_choice_b = ScalingType::BlockWise1x32;

@ -1068,29 +985,11 @@ _scaled_mxfp4_mxfp4(
  TORCH_CHECK_VALUE(out.scalar_type() == ScalarType::BFloat16 ||
              out.scalar_type() == ScalarType::Half,
              "Block-wise scaling only supports BFloat16 or Half output types");
+#else
+    TORCH_CHECK_NOT_IMPLEMENTED(false, "Block-wise scaling for Float8_e8m0fnu requires ROCm 7.0 or later");
 #endif

  return _scaled_gemm(mat_a, mat_b, scale_a, scale_b, scaling_choice_a, scaling_choice_b, bias, false /* use_fast_accum */, out);
-#else
-  // NVIDIA
-  // NOTE(slayton58): fbgemm_gpu::f4f4bf16 does *not* allow passing an output tensor,
-  //                  but we have one we need to use. Two clear options are to copy into
-  //                  our output (slow), or use a move-assignment-operator (faster).
-  //                  However, the compiler can complain about the explicit move preventing
-  //                  copy elision because the return from f4f4bf16 is a temporary object.
-  //                  So we don't explicitly move, and trust the compiler here...
-  //                  In the longer term this should be fixed on the FBGemm side.
-  out = fbgemm_gpu::f4f4bf16(
-      mat_a,
-      mat_b.transpose(-2, -1),
-      scale_a,
-      scale_b,
-      std::nullopt, /* global_scale */
-      true          /* use_mx */
-  );
-
-  return out;
-#endif
 }

 Tensor&
@ -1215,20 +1114,17 @@ _scaled_mm_cuda_v2_out(
        mat_a.size(0), "x", mat_a.size(1), " and ", mat_b.size(0), "x", mat_b.size(1), ")");
  }

-  // Handle fp4 packed-K dimension
-  int K_multiplier = (mat_a.scalar_type() == ScalarType::Float4_e2m1fn_x2) ? 2 : 1;
-
  TORCH_CHECK_VALUE(!bias || bias->numel() == mat_b.sizes()[1], "Bias must be size ", mat_b.sizes()[1],
       " but got ", bias->numel());
  TORCH_CHECK_VALUE(
-      K_multiplier * mat_a.sizes()[1] % 16 == 0,
+      mat_a.sizes()[1] % 16 == 0,
      "Expected trailing dimension of mat1 to be divisible by 16 ",
      "but got mat1 shape: (",
      mat_a.sizes()[0],
      "x",
-      K_multiplier * mat_a.sizes()[1],
+      mat_a.sizes()[1],
      ").");
-  TORCH_CHECK_VALUE(K_multiplier * mat_b.sizes()[0] % 16 == 0 && mat_b.sizes()[1] % 16 == 0, "mat2 shape (", mat_b.sizes()[0], "x",
+  TORCH_CHECK_VALUE(mat_b.sizes()[0] % 16 == 0 && mat_b.sizes()[1] % 16 == 0, "mat2 shape (", mat_b.sizes()[0], "x",
       mat_b.sizes()[1], ") must be divisible by 16");

  // TODO(slayton): Existing checks, not sure if they should really be here.
--- a/aten/src/ATen/native/cuda/ScatterGatherKernel.cu
+++ b/aten/src/ATen/native/cuda/ScatterGatherKernel.cu
@ -160,8 +160,8 @@ struct _cuda_scatter_gather_internal_kernel {
      auto offsets = offset_calc.get(i);

      int64_t idx_dim = *(index_t*)(index_ptr + offsets[2]);
-      CUDA_KERNEL_ASSERT_VERBOSE(idx_dim >= 0 && idx_dim < index_size
-        && "scatter gather kernel index out of bounds", "Expected 0 <= idx_dim < index_size (%ld), but got idx_dim = %ld", index_size, idx_dim);
+      CUDA_KERNEL_ASSERT(idx_dim >= 0 && idx_dim < index_size
+        && "scatter gather kernel index out of bounds");

      f(
        (scalar_t*)(self_ptr + offsets[0]),
@ -406,8 +406,9 @@ struct _cuda_scatter_fill_internal_kernel {
      auto offsets = offset_calc.get(i);

      int64_t idx_dim = *(index_t*)(index_ptr + offsets[1]);
-      CUDA_KERNEL_ASSERT_VERBOSE(idx_dim >= 0 && idx_dim < index_size
-        && "index out of bounds", "Expected 0 <= idx_dim < index_size (%ld), but got idx_dim = %ld", index_size, idx_dim);
+      CUDA_KERNEL_ASSERT(idx_dim >= 0 && idx_dim < index_size
+        && "index out of bounds"
+      );

      f(
        (scalar_t*)(self_ptr + offsets[0]),
--- a/aten/src/ATen/native/cuda/UnaryGeometricTanKernel.cu
+++ b/aten/src/ATen/native/cuda/UnaryGeometricTanKernel.cu
@ -12,15 +12,14 @@

 namespace at::native {

-#if 0 && AT_USE_JITERATOR()
+#if AT_USE_JITERATOR()
 constexpr char tan_name[] = "tan_impl";
 #endif

 void tan_kernel_cuda(TensorIteratorBase& iter) {
  auto common_dtype = iter.common_dtype();
  if (at::isComplexType(common_dtype)) {
-    // Disabled due to accuracy issues
-#if 0 && AT_USE_JITERATOR()
+#if AT_USE_JITERATOR()
    static const auto tan_string = jiterator_stringify(
        template <typename T> T tan_impl(T a) { return std::tan(a); });
    AT_DISPATCH_COMPLEX_TYPES_AND(
--- a/aten/src/ATen/native/cuda/UnaryGeometricTanhKernel.cu
+++ b/aten/src/ATen/native/cuda/UnaryGeometricTanhKernel.cu
@ -12,15 +12,14 @@

 namespace at::native {

-#if 0 && AT_USE_JITERATOR()
+#if AT_USE_JITERATOR()
 constexpr char tanh_name[] = "tanh_impl";
 #endif

 void tanh_kernel_cuda(TensorIteratorBase& iter) {
  auto common_dtype = iter.common_dtype();
  if (at::isComplexType(common_dtype)) {
-    // Disabled due to accuracy issues
-#if 0 && AT_USE_JITERATOR()
+#if AT_USE_JITERATOR()
    static const auto tanh_string = jiterator_stringify(
        template <typename T> T tanh_impl(T a) { return std::tanh(a); });
    AT_DISPATCH_COMPLEX_TYPES_AND(
--- a/aten/src/ATen/native/cuda/layer_norm_kernel.cu
+++ b/aten/src/ATen/native/cuda/layer_norm_kernel.cu
@ -141,8 +141,7 @@ WelfordDataLN cuWelfordOnlineSum(
  if constexpr (!rms_norm){
    U delta = val - curr_sum.mean;
    U new_count = curr_sum.count + 1.f;
-//Due to low CU count, we run into accuracy issues on gfx90a with `__builtin_amdgcn_rcpf`
-#if defined(USE_ROCM) && !defined(__gfx90a__) && defined(USE_LAYERNORM_FAST_RECIPROCAL)
+#if defined(USE_ROCM) && defined(USE_LAYERNORM_FAST_RECIPROCAL)
    U new_mean = curr_sum.mean + delta * __builtin_amdgcn_rcpf(new_count);
 #else
    U new_mean = curr_sum.mean + delta * (1.f/new_count); //proper division is slow, this is less accurate but noticeably faster
@ -164,8 +163,7 @@ WelfordDataLN cuWelfordCombine(
    U count = dataA.count + dataB.count;
    U mean, sigma2;
    if (count > decltype(dataB.count){0}) {
-//Due to low CU count, we run into accuracy issues on gfx90a with `__builtin_amdgcn_rcpf`
-#if defined(USE_ROCM) && !defined(__gfx90a__) && defined(USE_LAYERNORM_FAST_RECIPROCAL)
+#if defined(USE_ROCM) && defined(USE_LAYERNORM_FAST_RECIPROCAL)
      auto coef = __builtin_amdgcn_rcpf(count);
 #else
      auto coef = 1.f/count; //NB we don't use --use_fast_math, but this is emulation, 1./count goes to intrinsic, `* coef` is multiplication, instead of slow fp division
--- a/aten/src/ATen/native/mkldnn/xpu/Attention.cpp
+++ b/aten/src/ATen/native/mkldnn/xpu/Attention.cpp
@ -40,37 +40,14 @@ bool check_head_dim_size_xpu(sdp::sdp_params const& params, bool debug) {
  return true;
 }

-bool input_require_grad(
-    const at::Tensor& query,
-    const at::Tensor& key,
-    const at::Tensor& value,
-    const std::optional<at::Tensor>& attn_mask) {
-  return at::GradMode::is_enabled() &&
-      (query.requires_grad() || key.requires_grad() || value.requires_grad() ||
-       (attn_mask.has_value() && attn_mask.value().requires_grad()));
-}
-
-bool check_grad(sdp::sdp_params const& params, bool debug) {
-  if (!input_require_grad(
-          params.query, params.key, params.value, params.attn_mask))
-    return true;
-
-  auto q_num_heads = params.query.sym_size(-3);
-  auto k_num_heads = params.key.sym_size(-3);
-  auto v_num_heads = params.value.sym_size(-3);
-  bool is_gqa = q_num_heads != k_num_heads || q_num_heads != v_num_heads;
-  if (debug && is_gqa)
-    TORCH_WARN(
-        "scale_dot_product_attention with gqa is not supported for gradient computation on xpu.");
-
-  bool attn_mask_needs_grad =
-      params.attn_mask.has_value() && params.attn_mask.value().requires_grad();
-  if (debug && attn_mask_needs_grad) {
-    TORCH_WARN(
-        "scale_dot_product_attention on xpu is not supported when attn_mask.requires_grad() == True.");
+bool check_no_grad(sdp::sdp_params const& params, bool debug) {
+  const bool any_inputs_require_grad = params.query.requires_grad() ||
+      params.key.requires_grad() || params.value.requires_grad();
+  const bool gradmode_enabled = at::GradMode::is_enabled();
+  if (debug && any_inputs_require_grad && gradmode_enabled) {
+    TORCH_WARN("Backward or grad to be supported.");
  }
-
-  return !is_gqa && !attn_mask_needs_grad;
+  return !any_inputs_require_grad || !gradmode_enabled;
 }

 bool can_use_overrideable_attention(sdp::sdp_params const& params, bool debug) {
@ -88,7 +65,7 @@ bool can_use_overrideable_attention(sdp::sdp_params const& params, bool debug) {
      sdp::check_nonzero_sequence_lengths_dense,
      sdp::check_last_dim_stride_equals_1_dense<false /*ignore_singleton_dim*/>,
      check_head_dim_size_xpu,
-      check_grad);
+      check_no_grad);
  for (auto& constraint : constraints) {
    if (!constraint(params, debug)) {
      return false;
@ -248,11 +225,10 @@ _scaled_dot_product_fused_attention_overrideable_xpu(
    double dropout_p,
    bool is_causal,
    bool return_debug_mask,
-    std::optional<double> scale,
-    bool compute_logsumexp) {
+    std::optional<double> scale) {
  TORCH_INTERNAL_ASSERT(
      query.dim() == 4 && key.dim() == 4 && value.dim() == 4,
-      "scaled_dot_product_fused_attention_overrideable_xpu: Accept only 4 dims inputs shape of {B, H, T, K}");
+      "scaled_dot_product_fused_attention_overrideable_xpu: Accept only 4 dims inputs shape of {(B), H, T, K}");
  TORCH_INTERNAL_ASSERT(
      (key.size(0) == value.size(0)) && (key.size(1) == value.size(1)) &&
          (key.size(2) == value.size(2)),
@ -269,9 +245,6 @@ _scaled_dot_product_fused_attention_overrideable_xpu(
  TORCH_INTERNAL_ASSERT(
      !(attn_bias.has_value() && is_causal),
      "scaled_dot_product_fused_attention_overrideable_xpu: attn_bias cannot present with is_causal");
-  TORCH_INTERNAL_ASSERT(
-      !(attn_bias.has_value() && attn_bias.value().requires_grad()),
-      "scaled_dot_product_fused_attention_overrideable_xpu: attn_bias cannot have requires_grad=True");

  const int64_t batch_size = query.size(0);
  const int64_t num_head_q = query.size(1);
@ -281,14 +254,11 @@ _scaled_dot_product_fused_attention_overrideable_xpu(
  const int64_t seq_len_q = query.size(2);
  const int64_t seq_len_kv = key.size(2);

-  at::Tensor attention;
-  std::vector<int64_t> attention_shape = {
+  at::Tensor output;
+  std::vector<int64_t> output_shape = {
      batch_size, num_head_q, seq_len_q, head_dim_v};
-  alloc_with_matching_layout(query, attention, attention_shape);
-
-  auto opts = query.options();
-  at::Tensor logsumexp =
-      at::empty({batch_size, num_head_q, seq_len_q}, opts.dtype(at::kFloat));
+  alloc_with_matching_layout(query, output, output_shape);
+  at::Tensor logsumexp, debug_attn_mask; // not supported

  at::native::onednn::sdpa(
      batch_size,
@ -304,15 +274,15 @@ _scaled_dot_product_fused_attention_overrideable_xpu(
      attn_bias,
      is_causal,
      scale.has_value() ? scale.value() : (1.0 / std::sqrt(head_dim_qk)),
-      attention,
-      compute_logsumexp,
+      output,
+      false,
      logsumexp);

  // rng not used
  auto philox_seed = at::empty({}, at::dtype(at::kLong));
  auto philox_offset = at::empty({}, at::dtype(at::kLong));
  return std::make_tuple(
-      attention,
+      output,
      logsumexp,
      /* cum_seq_q */ at::Tensor(),
      /* cum_seq_k */ at::Tensor(),
@ -320,106 +290,7 @@ _scaled_dot_product_fused_attention_overrideable_xpu(
      seq_len_kv,
      philox_seed,
      philox_offset,
-      /*debug_attn_mask */ at::Tensor());
-}
-
-std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor>
-_scaled_dot_product_fused_attention_overrideable_backward_xpu(
-    const at::Tensor& grad_out,
-    const at::Tensor& query,
-    const at::Tensor& key,
-    const at::Tensor& value,
-    const at::Tensor& attn_bias,
-    std::array<bool, 4> grad_input_mask,
-    const at::Tensor& out,
-    const at::Tensor& logsumexp,
-    const at::Tensor& cum_seq_q,
-    const at::Tensor& cum_seq_k,
-    int64_t max_q,
-    int64_t max_k,
-    double dropout_p,
-    bool is_causal,
-    const at::Tensor& philox_seed,
-    const at::Tensor& philox_offset,
-    std::optional<double> scale) {
-  TORCH_INTERNAL_ASSERT(
-      grad_out.dim() == 4 && out.dim() == 4 &&
-          grad_out.size(0) == out.size(0) && grad_out.size(1) == out.size(1) &&
-          grad_out.size(2) == out.size(2) && grad_out.size(3) == out.size(3),
-      "scaled_dot_product_fused_attention_overrideable_backward_xpu: grad_out and out should have the same shape of {B, H, T, K}");
-  TORCH_INTERNAL_ASSERT(
-      query.dim() == 4 && key.dim() == 4 && value.dim() == 4,
-      "scaled_dot_product_fused_attention_overrideable_backward_xpu: Accept only 4 dims inputs shape of {B, H, T, K}");
-  TORCH_INTERNAL_ASSERT(
-      (key.size(0) == value.size(0)) && (key.size(1) == value.size(1)) &&
-          (key.size(2) == value.size(2)),
-      "scaled_dot_product_fused_attention_overrideable_backward_xpu: K/V should have the same batch / seq / num_head");
-  TORCH_INTERNAL_ASSERT(
-      query.size(0) == grad_out.size(0) && query.size(1) == grad_out.size(1) &&
-          query.size(2) == grad_out.size(2),
-      "scaled_dot_product_fused_attention_overrideable_backward_xpu: Q should have the same batch / num_head / seq_len as grad_out");
-  TORCH_INTERNAL_ASSERT(
-      query.size(3) == key.size(3),
-      "scaled_dot_product_fused_attention_overrideable_backward_xpu: Q/K should have the same head_dim");
-  TORCH_INTERNAL_ASSERT(
-      value.size(3) == grad_out.size(3),
-      "scaled_dot_product_fused_attention_overrideable_backward_xpu: V should have the same head_dim as grad_out");
-  TORCH_INTERNAL_ASSERT(
-      query.size(1) == key.size(1),
-      "scaled_dot_product_fused_attention_overrideable_backward_xpu: number of heads in K/V must equal to number of heads in Q");
-  TORCH_INTERNAL_ASSERT(
-      dropout_p == 0.0,
-      "scaled_dot_product_fused_attention_overrideable_backward_xpu: Currently do not support dropout > 0");
-  TORCH_INTERNAL_ASSERT(
-      logsumexp.dim() == 3 && logsumexp.size(0) == query.size(0) &&
-      logsumexp.size(1) == query.size(1) &&
-      logsumexp.size(2) == query.size(2) &&
-      "scaled_dot_product_fused_attention_overrideable_backward_xpu: logsumexp should have the shape of {B, H, T}");
-
-  std::optional<Tensor> attn_bias_opt;
-  if (attn_bias.defined()) {
-    attn_bias_opt = attn_bias;
-  }
-
-  const int64_t batch_size = query.size(0);
-  const int64_t num_head_q = query.size(1);
-  const int64_t num_head_kv = key.size(1);
-  const int64_t seq_len_q = query.size(2);
-  const int64_t seq_len_kv = key.size(2);
-  const int64_t head_dim_qk = query.size(3);
-  const int64_t head_dim_v = value.size(3);
-
-  auto grad_q = at::empty_like(query);
-  auto grad_k = at::empty_like(key);
-  auto grad_v = at::empty_like(value);
-  auto grad_attn_bias = attn_bias_opt.has_value()
-      ? at::empty_like(attn_bias_opt.value())
-      : at::Tensor();
-  at::native::onednn::sdpa_backward(
-      batch_size,
-      num_head_q,
-      num_head_kv,
-      seq_len_q,
-      seq_len_kv,
-      head_dim_qk,
-      head_dim_v,
-      grad_out,
-      query,
-      key,
-      value,
-      out,
-      logsumexp,
-      attn_bias_opt,
-      is_causal,
-      scale.has_value() ? scale.value() : (1.0 / std::sqrt(query.size(3))),
-      grad_q,
-      grad_k,
-      grad_v);
-  return std::make_tuple(
-      std::move(grad_q),
-      std::move(grad_k),
-      std::move(grad_v),
-      std::move(grad_attn_bias));
+      debug_attn_mask);
 }

 REGISTER_XPU_DISPATCH(_fused_sdp_choice_stub, &_fused_sdp_choice_xpu);
--- a/aten/src/ATen/native/mps/kernels/BinaryKernel.metal
+++ b/aten/src/ATen/native/mps/kernels/BinaryKernel.metal
@ -86,28 +86,6 @@ struct zeta_functor {
  }
 };

-struct logaddexp_functor {
-  template <typename T, enable_if_t<is_floating_point_v<T>, bool> = true>
-  inline T operator()(const T a, const T b) {
-    return c10::metal::logaddexp(a, b);
-  }
-  template <typename T, enable_if_t<is_integral_v<T>, bool> = true>
-  inline float operator()(const T a, const T b) {
-    return c10::metal::logaddexp(float(a), float(b));
-  }
-};
-
-struct logaddexp2_functor {
-  template <typename T, enable_if_t<is_floating_point_v<T>, bool> = true>
-  inline T operator()(const T a, const T b) {
-    return c10::metal::logaddexp2(a, b);
-  }
-  template <typename T, enable_if_t<is_integral_v<T>, bool> = true>
-  inline float operator()(const T a, const T b) {
-    return c10::metal::logaddexp2(float(a), float(b));
-  }
-};
-
 struct xlog1py_functor {
  template <typename T, enable_if_t<is_floating_point_v<T>, bool> = true>
  inline T operator()(const T a, const T b) {
@ -399,10 +377,6 @@ REGISTER_FLOAT_BINARY_OP(fmin);
 REGISTER_FLOAT_BINARY_OP(nextafter);
 REGISTER_FLOAT_BINARY_OP(zeta);
 REGISTER_INT2FLOAT_BINARY_OP(zeta);
-REGISTER_FLOAT_BINARY_OP(logaddexp);
-REGISTER_INT2FLOAT_BINARY_OP(logaddexp);
-REGISTER_FLOAT_BINARY_OP(logaddexp2);
-REGISTER_INT2FLOAT_BINARY_OP(logaddexp2);
 REGISTER_FLOAT_BINARY_OP(xlog1py);
 REGISTER_INT2FLOAT_BINARY_OP(xlog1py);
 REGISTER_FLOAT_BINARY_OP(chebyshev_polynomial_t);
@ -489,8 +463,6 @@ REGISTER_BINARY_OP(add, float2, float2);
 REGISTER_BINARY_OP(add, half2, half2);
 REGISTER_BINARY_OP(sub, float2, float2);
 REGISTER_BINARY_OP(sub, half2, half2);
-REGISTER_BINARY_OP(logaddexp, float2, float2);
-REGISTER_BINARY_OP(logaddexp, half2, half2);
 REGISTER_BINARY_ALPHA_OP(add_alpha, float2, float2, float2);
 REGISTER_BINARY_ALPHA_OP(add_alpha, half2, half2, half2);
 REGISTER_BINARY_ALPHA_OP(sub_alpha, float2, float2, float2);
--- a/aten/src/ATen/native/mps/operations/BinaryKernel.mm
+++ b/aten/src/ATen/native/mps/operations/BinaryKernel.mm
@ -89,14 +89,6 @@ static void zeta_mps_kernel(TensorIteratorBase& iter) {
  lib.exec_binary_kernel(iter, "zeta");
 }

-static void logaddexp_mps_kernel(TensorIteratorBase& iter) {
-  lib.exec_binary_kernel(iter, "logaddexp");
-}
-
-static void logaddexp2_mps_kernel(TensorIteratorBase& iter) {
-  lib.exec_binary_kernel(iter, "logaddexp2");
-}
-
 static void xlog1py_mps_kernel(TensorIteratorBase& iter) {
  TORCH_CHECK_TYPE(isFloatingType(iter.common_dtype()), "xlog1py_mps not implemented for non-floating types");
  lib.exec_binary_kernel(iter, "xlog1py");
@ -219,8 +211,6 @@ REGISTER_DISPATCH(fmin_stub, &fmin_mps_kernel)
 REGISTER_DISPATCH(copysign_stub, &copysign_mps_kernel)
 REGISTER_DISPATCH(nextafter_stub, &nextafter_mps_kernel)
 REGISTER_DISPATCH(zeta_stub, &zeta_mps_kernel)
-REGISTER_DISPATCH(logaddexp_stub, &logaddexp_mps_kernel);
-REGISTER_DISPATCH(logaddexp2_stub, &logaddexp2_mps_kernel);
 REGISTER_DISPATCH(xlog1py_stub, &xlog1py_mps_kernel)
 REGISTER_DISPATCH(chebyshev_polynomial_t_stub, &chebyshev_polynomial_t_mps_kernel)
 REGISTER_DISPATCH(chebyshev_polynomial_u_stub, &chebyshev_polynomial_u_mps_kernel)
--- a/aten/src/ATen/native/mps/operations/BinaryOps.mm
+++ b/aten/src/ATen/native/mps/operations/BinaryOps.mm
@ -17,6 +17,8 @@
 #include <ATen/ops/ge_native.h>
 #include <ATen/ops/gt_native.h>
 #include <ATen/ops/le_native.h>
+#include <ATen/ops/logaddexp2_native.h>
+#include <ATen/ops/logaddexp_native.h>
 #include <ATen/ops/logical_and_native.h>
 #include <ATen/ops/logical_or_native.h>
 #include <ATen/ops/logical_xor_native.h>
@ -275,6 +277,30 @@ TORCH_IMPL_FUNC(pow_Scalar_out_mps)(const Scalar& base, const Tensor& exp, const
  }
 }

+TORCH_IMPL_FUNC(logaddexp_out_mps)(const Tensor& self, const Tensor& other, const Tensor& output) {
+  mps::BinaryOpBlock logaddexp_op_block = ^BinaryOpFn(cachedGraph, primaryCastTensor, secondaryCastTensor) {
+    MPSGraph* mpsGraph = cachedGraph->graph();
+    MPSGraphTensor* sumTensor =
+        [mpsGraph additionWithPrimaryTensor:[mpsGraph exponentWithTensor:primaryCastTensor name:nil]
+                            secondaryTensor:[mpsGraph exponentWithTensor:secondaryCastTensor name:nil]
+                                       name:nil];
+    return [mpsGraph logarithmWithTensor:sumTensor name:nil];
+  };
+  mps::binaryOpTensor(self, other, output, "logaddexp_out_mps", logaddexp_op_block);
+}
+
+TORCH_IMPL_FUNC(logaddexp2_out_mps)(const Tensor& self, const Tensor& other, const Tensor& output) {
+  mps::BinaryOpBlock logaddexp2_op_block = ^BinaryOpFn(cachedGraph, primaryCastTensor, secondaryCastTensor) {
+    MPSGraph* mpsGraph = cachedGraph->graph();
+    MPSGraphTensor* sumTensor =
+        [mpsGraph additionWithPrimaryTensor:[mpsGraph exponentBase2WithTensor:primaryCastTensor name:nil]
+                            secondaryTensor:[mpsGraph exponentBase2WithTensor:secondaryCastTensor name:nil]
+                                       name:nil];
+    return [mpsGraph logarithmBase2WithTensor:sumTensor name:nil];
+  };
+  mps::binaryOpTensor(self, other, output, "logaddexp2_out_mps", logaddexp2_op_block);
+}
+
 TORCH_IMPL_FUNC(xlogy_out_mps)(const Tensor& self, const Tensor& other, const Tensor& output) {
  mps::BinaryOpBlock xlogy_op_block = ^BinaryOpFn(cachedGraph, primaryCastTensor, secondaryCastTensor) {
    MPSGraph* mpsGraph = cachedGraph->graph();
--- a/aten/src/ATen/native/mps/operations/Distributions.mm
+++ b/aten/src/ATen/native/mps/operations/Distributions.mm
@ -57,7 +57,6 @@ Tensor& random_mps_impl(Tensor& self,
  if (self.numel() == 0) {
    return self;
  }
-  at::assert_no_internal_overlap(self);
  // MPS random is broken for 5D+ tensors, see https://github.com/pytorch/pytorch/issues/147624
  const auto need_reshape = self.ndimension() > 4;
  auto mps_gen = get_generator_or_default<MPSGeneratorImpl>(gen, at::mps::detail::getDefaultMPSGenerator());
@ -154,16 +153,8 @@ Tensor& random_mps_impl(Tensor& self,
      feeds[meanPlaceholder.getMPSGraphTensor()] = meanPlaceholder.getMPSGraphTensorData();
    }

-    // Handle non-contiguous output tensors by creating a contiguous temporary
-    const auto needs_gather = needsGather(self);
-    Tensor self_ = needs_gather ? at::empty_like(self, MemoryFormat::Contiguous) : self;
-    Placeholder outputPlaceholder = Placeholder(cachedGraph->resultTensor, self_);
+    Placeholder outputPlaceholder = Placeholder(cachedGraph->resultTensor, self);
    runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder);
-
-    // Copy results back to original non-contiguous output
-    if (needs_gather) {
-      self.copy_(self_);
-    }
  }

  return self;
--- a/aten/src/ATen/native/mps/operations/Indexing.mm
+++ b/aten/src/ATen/native/mps/operations/Indexing.mm
@ -617,9 +617,6 @@ Tensor& index_select_out_mps(const Tensor& self, int64_t dim, const Tensor& inde
  TORCH_CHECK(self.scalar_type() == output.scalar_type(),
              "index_select(): self and output must have the same scalar type");
  TORCH_CHECK(dim == 0 || dim < self.dim(), "index_select(): Indexing dim ", dim, " is out of bounds of tensor");
-  at::assert_no_internal_overlap(output);
-  at::assert_no_overlap(output, self);
-  at::assert_no_overlap(output, index);
  auto output_size = self.sizes().vec();
  if (self.dim() > 0) {
    output_size[dim] = num_indices;
--- a/aten/src/ATen/native/mps/operations/ReduceOps.mm
+++ b/aten/src/ATen/native/mps/operations/ReduceOps.mm
@ -1028,18 +1028,15 @@ TORCH_IMPL_FUNC(prod_out_mps)
 }

 TORCH_IMPL_FUNC(amax_out_mps)(const Tensor& input_t, IntArrayRef dim, bool keepdim, const Tensor& output_t) {
-  TORCH_CHECK(!c10::isComplexType(input_t.scalar_type()), "amax is not defined for complex types");
  reduction_out_mps(input_t, dim, keepdim, std::nullopt, output_t, MPSReductionType::AMAX, "amax_out_mps");
 }

 TORCH_IMPL_FUNC(amin_out_mps)(const Tensor& input_t, IntArrayRef dim, bool keepdim, const Tensor& output_t) {
-  TORCH_CHECK(!c10::isComplexType(input_t.scalar_type()), "amin is not defined for complex types");
  reduction_out_mps(input_t, dim, keepdim, std::nullopt, output_t, MPSReductionType::AMIN, "amin_out_mps");
 }

 TORCH_IMPL_FUNC(aminmax_out_mps)
 (const Tensor& input_t, std::optional<int64_t> dim_opt, bool keepdim, const Tensor& min_t, const Tensor& max_t) {
-  TORCH_CHECK(!c10::isComplexType(input_t.scalar_type()), "aminmax is not defined for complex types");
  reduction_out_mps(input_t,
                    dim_opt.has_value() ? OptionalIntArrayRef({*dim_opt}) : std::nullopt,
                    keepdim,
--- a/aten/src/ATen/native/mps/operations/Sort.mm
+++ b/aten/src/ATen/native/mps/operations/Sort.mm
@ -31,7 +31,6 @@ void kthvalue_out_mps_impl(const Tensor& self, int64_t k, int64_t dim, Tensor& v
    indices.copy_(values.toType(at::ScalarType::Long));
    return;
  }
-  TORCH_CHECK_NOT_IMPLEMENTED(!c10::isComplexType(self.scalar_type()), "kthvalue is not implemented for complex types");
  // issue #154890, raising error to prevent crash within MPSGraph until
  // workaround is implemented.
  TORCH_CHECK(self.dim() - dim <= 4, "On-going issue on MPSGraph topk when ndims() - axis > 4, see issue #154890");
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@ -3622,7 +3622,8 @@
  structured: True
  structured_inherits: TensorIteratorBase
  dispatch:
-    CPU, CUDA, MPS: logaddexp_out
+    CPU, CUDA: logaddexp_out
+    MPS: logaddexp_out_mps
  tags: pointwise

 - func: logaddexp(Tensor self, Tensor other) -> Tensor
@ -3634,7 +3635,8 @@
  structured: True
  structured_inherits: TensorIteratorBase
  dispatch:
-    CPU, CUDA, MPS: logaddexp2_out
+    CPU, CUDA: logaddexp2_out
+    MPS: logaddexp2_out_mps
  tags: pointwise

 - func: logaddexp2(Tensor self, Tensor other) -> Tensor
@ -15095,7 +15097,7 @@
    CPU: _scaled_dot_product_flash_attention_cpu
  tags: nondeterministic_seeded

- func: _scaled_dot_product_fused_attention_overrideable(Tensor query, Tensor key, Tensor value, Tensor? attn_bias=None, float dropout_p=0.0, bool is_causal=False, bool return_debug_mask=False, *, float? scale=None, bool compute_log_sumexp=True) -> (Tensor output, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, Tensor philox_seed, Tensor philox_offset, Tensor debug_attn_mask)
+- func: _scaled_dot_product_fused_attention_overrideable(Tensor query, Tensor key, Tensor value, Tensor? attn_bias=None, float dropout_p=0.0, bool is_causal=False, bool return_debug_mask=False, *, float? scale=None) -> (Tensor output, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, Tensor philox_seed, Tensor philox_offset, Tensor debug_attn_mask)
  dispatch:
    CompositeExplicitAutograd: _scaled_dot_product_fused_attention_overrideable
    XPU: _scaled_dot_product_fused_attention_overrideable_xpu
@ -15119,7 +15121,6 @@
  variants: function
  dispatch:
    CompositeExplicitAutograd: _scaled_dot_product_fused_attention_overrideable_backward
-    XPU: _scaled_dot_product_fused_attention_overrideable_backward_xpu

 - func: _scaled_dot_product_efficient_attention(Tensor query, Tensor key, Tensor value, Tensor? attn_bias, bool compute_log_sumexp, float dropout_p=0.0, bool is_causal=False, *, float? scale=None) -> (Tensor output, Tensor log_sumexp, Tensor philox_seed, Tensor philox_offset)
  dispatch:
--- a/aten/src/ATen/native/quantized/cpu/UpSampleBilinear2d.cpp
+++ b/aten/src/ATen/native/quantized/cpu/UpSampleBilinear2d.cpp
@ -73,7 +73,8 @@ void upsample_bilinear2d_out_frame(
  const auto rwidth = area_pixel_compute_scale<float>(
      input_width, output_width, align_corners, scales_w);

-  float output_scale = static_cast<float>(output.q_scale() / input.q_scale());
+  // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions)
+  float output_scale = output.q_scale() / input.q_scale();

  const int64_t input_q_zero_point = input.q_zero_point();
  const int64_t output_q_zero_point = output.q_zero_point();
--- a/aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp
+++ b/aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp
@ -148,7 +148,7 @@ Tensor qcat_nhwc_kernel(
          // Vectorized loop
          if (c + VLEN <= curr_C) {
            auto curr_scale_vec = Vectorized<float>(curr_scale);
-            auto curr_zero_pt_vec = Vectorized<float>(curr_zero_pt);
+            auto curr_zero_pt_vec = Vectorized<float>((float)curr_zero_pt);
            auto scale_neg_zp_premul = curr_scale_vec * curr_zero_pt_vec.neg();
            for (; c + VLEN <= curr_C; c += VLEN) {
              auto inp_vec = Vec::loadu(iptr + c);
@ -174,7 +174,7 @@ Tensor qcat_nhwc_kernel(
          int64_t elem_size = curr_C - c;
          if ((VLEN == 4 * kVLEN) && elem_size >= kVLEN) {
            auto curr_scale_vec = Vectorized<float>(curr_scale);
-            auto curr_zero_pt_vec = Vectorized<float>(curr_zero_pt);
+            auto curr_zero_pt_vec = Vectorized<float>((float)curr_zero_pt);
            auto scale_neg_zp_premul = curr_scale_vec * curr_zero_pt_vec.neg();
            int64_t vec_num = elem_size / kVLEN;
            std::array<typename scalar_t::underlying, VLEN> buf_in{};
@ -611,10 +611,12 @@ void qrelu_kernel(const Tensor& qx, Tensor& qy) {
 void leaky_qrelu_out_kernel(Tensor& out, const Tensor& qx,
                                   const Scalar& negval_) {
  int64_t i_zp = qx.q_zero_point();
-  float i_scale = static_cast<float>(qx.q_scale());
+  // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
+  float i_scale = qx.q_scale();

  int64_t o_zp = out.q_zero_point();
-  float o_scale = static_cast<float>(out.q_scale());
+  // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
+  float o_scale = out.q_scale();
  float o_inv_scale = 1.0f / o_scale;

  float negval = negval_.to<float>();
@ -625,8 +627,8 @@ void leaky_qrelu_out_kernel(Tensor& out, const Tensor& qx,
    Vec zero_vec = Vec(0.0f);
    Vec one_vec = Vec(1.0f);

-    Vec i_scale_vec = Vec(i_scale);
-    Vec i_zp_vec = Vec(i_zp);
+    Vec i_scale_vec = Vec((float)i_scale);
+    Vec i_zp_vec = Vec((float)i_zp);
    Vec i_scale_zp_neg_premul_vec = i_scale_vec * i_zp_vec.neg();

    Vec negval_vec = Vec(negval);
@ -736,9 +738,10 @@ void qprelu_out_kernel(Tensor& out,

 void qgelu_kernel(const Tensor& qx, Tensor& qy, GeluType approximate) {
  int64_t zero_point = qx.q_zero_point();
-  float scale = static_cast<float>(qx.q_scale());
+  // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
+  float scale = qx.q_scale();
  auto scale_vec = Vectorized<float>(scale);
-  auto zero_point_vec = Vectorized<float>(zero_point);
+  auto zero_point_vec = Vectorized<float>((float)zero_point);
  auto scale_neg_zp_premul_vec = scale_vec * zero_point_vec.neg();
  int64_t output_zero_point = zero_point;
  float output_scale = scale;
@ -825,9 +828,10 @@ void qgelu_kernel(const Tensor& qx, Tensor& qy, GeluType approximate) {
 void qsigmoid_kernel(
    const Tensor& qx, Tensor& qy, double output_scale, int64_t output_zero_point ) {
  int64_t zero_point = qx.q_zero_point();
-  float scale = static_cast<float>(qx.q_scale());
+  // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
+  float scale = qx.q_scale();
  auto scale_vec = Vectorized<float>(scale);
-  auto zero_point_vec = Vectorized<float>(zero_point);
+  auto zero_point_vec = Vectorized<float>((float)zero_point);

  AT_DISPATCH_QINT_TYPES(qx.scalar_type(), "qsigmoid", [&]() {
    float inv_output_scale = 1.0 / output_scale;
@ -866,9 +870,10 @@ void qsigmoid_kernel(

 void qhardsigmoid_kernel(const Tensor& qx, Tensor& qy) {
  int64_t zero_point = qx.q_zero_point();
-  float scale = static_cast<float>(qx.q_scale());
+  // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
+  float scale = qx.q_scale();
  auto scale_vec = Vectorized<float>(scale);
-  auto zero_point_vec = Vectorized<float>(zero_point);
+  auto zero_point_vec = Vectorized<float>((float)zero_point);
  auto scale_neg_zp_premul_vec = scale_vec * zero_point_vec.neg();

  AT_DISPATCH_QINT_TYPES(qx.scalar_type(), "qhardsigmoid", [&]() {
@ -1024,10 +1029,13 @@ void qthreshold_kernel(

  // defines input and output scales and zero_points
  int64_t input_zero_point = qx.q_zero_point();
-  float input_scale = static_cast<float>(qx.q_scale());
+  // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
+  float input_scale = qx.q_scale();
  int64_t output_zero_point = qy.q_zero_point();
-  float output_scale = static_cast<float>(qy.q_scale());
-  float inv_output_scale = static_cast<float>(1.0 / output_scale);
+  // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
+  float output_scale = qy.q_scale();
+  // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
+  float inv_output_scale = 1.0 / output_scale;

  AT_DISPATCH_QINT_TYPES(qx.scalar_type(), "qthreshold", [&]() {
    qy = at::_empty_affine_quantized(
@ -1088,7 +1096,8 @@ void qhardswish_kernel(const Tensor& qx, Tensor& qy) {

  const auto o_scale = qy.q_scale();
  const auto o_zero_point = qy.q_zero_point();
-  const float o_inv_scale = static_cast<float>(1.0 / o_scale);
+  // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
+  const float o_inv_scale = 1.0 / o_scale;

  using fVec = Vectorized<float>;
  fVec i_scale_vec(i_scale);
@ -1126,9 +1135,10 @@ void qhardswish_kernel(const Tensor& qx, Tensor& qy) {

 void qtanh_kernel(const Tensor& qx, Tensor& qy) {
  int64_t zero_point = qx.q_zero_point();
-  float scale = static_cast<float>(qx.q_scale());
+  // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
+  float scale = qx.q_scale();
  auto scale_vec = Vectorized<float>(scale);
-  auto zero_point_vec = Vectorized<float>(zero_point);
+  auto zero_point_vec = Vectorized<float>((float)zero_point);
  auto scale_neg_zp_premul_vec = scale_vec * zero_point_vec.neg();

  AT_DISPATCH_QINT_TYPES(qx.scalar_type(), "qtanh", [&]() {
@ -1188,13 +1198,16 @@ void qelu_kernel(
  // they are NOT related to the quantization scale term

  int64_t i_zp = qx.q_zero_point();
-  float i_scale = static_cast<float>(qx.q_scale());
+  // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
+  float i_scale = qx.q_scale();

  // In a future PR, we can improve on output scale and zero_point
  // selection.
  int64_t o_zp = qy.q_zero_point();
-  float o_scale = static_cast<float>(qy.q_scale());
-  float inv_o_scale = static_cast<float>(1.0 / o_scale);
+  // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
+  float o_scale = qy.q_scale();
+  // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
+  float inv_o_scale = 1.0 / o_scale;

  float alpha_float = alpha.to<float>();
  float scale_coef = scale.to<float>();
@ -1214,7 +1227,7 @@ void qelu_kernel(
    Vec scale_coef_vec = Vec(scale_coef);
    Vec input_scale_coef_vec = Vec(input_scale_coef);
    Vec i_scale_vec = Vec(i_scale);
-    Vec i_zero_point_vec = Vec(i_zp);
+    Vec i_zero_point_vec = Vec((float)i_zp);
    Vec i_scale_neg_zp_premul_vec = i_scale_vec * i_zero_point_vec.neg();

    cpu_kernel_vec(
@ -1313,20 +1326,23 @@ void qadd_scalar_kernel(Tensor& out, const Tensor& self, const Scalar& other) {
 template <bool ReLUFused = false>
 void qadd_kernel(Tensor& out, const Tensor& self, const Tensor& other) {
  int64_t zero_point = out.q_zero_point();
-  float scale = static_cast<float>(out.q_scale());
+  // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
+  float scale = out.q_scale();
  float inv_scale = 1.0f / scale;
  int64_t self_zero_point = self.q_zero_point();
-  float self_scale = static_cast<float>(self.q_scale());
+  // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
+  float self_scale = self.q_scale();
  int64_t other_zero_point = other.q_zero_point();
-  float other_scale = static_cast<float>(other.q_scale());
+  // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
+  float other_scale = other.q_scale();

  // Broadcast out the parameters here to amortize out that cost across
  // loop iterations.
  // TODO: we can optimize dequantization by doing a premultiplication
  // of the zero point by scale and doing FMA on scale*x_q - (scale*zero_point)
-  auto self_zero_point_vec = Vectorized<float>(self_zero_point);
+  auto self_zero_point_vec = Vectorized<float>((float)self_zero_point);
  auto self_scale_vec = Vectorized<float>(self_scale);
-  auto other_zero_point_vec = Vectorized<float>(other_zero_point);
+  auto other_zero_point_vec = Vectorized<float>((float)other_zero_point);
  auto other_scale_vec = Vectorized<float>(other_scale);

  auto self_scale_neg_zp_premul_vec = self_scale_vec * self_zero_point_vec.neg();
@ -2949,7 +2965,7 @@ void quantized_normalize_kernel(
    const bool beta_null = beta_data == nullptr;
    int64_t x_zp = X.q_zero_point();
    float x_scale = X.q_scale();
-    fVec x_zp_vec(x_zp);
+    fVec x_zp_vec((float)x_zp);
    fVec one_vec(1.0f);
    fVec zero_vec(0.0f);
    float x_fake_scale = 1.0f;
@ -3237,7 +3253,7 @@ void quantized_groupnorm_nhwc_kernel(
    const bool beta_null = beta_data == nullptr;
    int64_t x_zp = X.q_zero_point();
    float x_scale = X.q_scale();
-    fVec x_zp_vec(x_zp);
+    fVec x_zp_vec((float)x_zp);
    fVec one_vec(1.0f);
    fVec zero_vec(0.0f);
    float x_fake_scale = 1.0f;
--- a/aten/src/ATen/native/quantized/cpu/qlinear_dynamic.cpp
+++ b/aten/src/ATen/native/quantized/cpu/qlinear_dynamic.cpp
@ -414,6 +414,7 @@ at::Tensor& PackedLinearWeightFp16::apply_dynamic_impl(
  TORCH_CHECK(input.size(input.dim() - 1) == packed_weight_fp16.numRows())
  TORCH_CHECK(input.dim() >= 2);

+  // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
  const int64_t M = size_to_dim_(input.dim() - 1, input.sizes());
  const int64_t N = packed_weight_fp16.numCols();
  std::vector<int64_t> output_sizes = input.sizes().vec();
--- a/aten/src/ATen/native/sparse/SparseTensor.cpp
+++ b/aten/src/ATen/native/sparse/SparseTensor.cpp
@ -467,28 +467,6 @@ Tensor sparse_coo_tensor(const Tensor& indices, const Tensor& values, IntArrayRe
      !options.has_layout() || options.layout() == kSparse,
      "expected sparse layout, but got layout ",
      options.layout());
-
-  if (indices.numel() > 0) {
-    Tensor min_indices =
-        std::get</* values */ 0>(indices.min(/* dim */ 1, /* keepdim */ false));
-    Tensor cpu_min_indices;
-    if (!indices.is_cpu()) {
-      cpu_min_indices = min_indices.to(at::DeviceType::CPU);
-    } else {
-      cpu_min_indices = min_indices;
-    }
-    auto cpu_min_indices_accessor = cpu_min_indices.accessor<int64_t, 1>();
-    for (const auto d : c10::irange(indices.size(0))) {
-      int64_t min_index_in_dim = cpu_min_indices_accessor[d];
-      TORCH_CHECK(
-          min_index_in_dim >= 0,
-          "found negative index ",
-          min_index_in_dim,
-          " for dim ",
-          d);
-    }
-  }
-
  return at::native::_sparse_coo_tensor_unsafe(
      indices,
      values,
--- a/aten/src/ATen/native/transformers/attention.cpp
+++ b/aten/src/ATen/native/transformers/attention.cpp
@ -768,11 +768,8 @@ Tensor scaled_dot_product_attention(
      return std::get<0>(out_and_lse);
    }
    case SDPBackend::overrideable: {
-      bool compute_logsumexp = should_compute_logsumexp(query_, key, value);
-      compute_logsumexp = compute_logsumexp ||
-          (at::GradMode::is_enabled() && attn_mask.has_value() && attn_mask.value().requires_grad());
      auto out_lse_softmax = at::_scaled_dot_product_fused_attention_overrideable(
-          query_, key, value, attn_mask, dropout_p, is_causal, false /*return_debug_mask*/, scale, compute_logsumexp);
+          query_, key, value, attn_mask, dropout_p, is_causal, false /*return_debug_mask*/, scale);
      return std::get<0>(out_lse_softmax);
    }
    case SDPBackend::math: {
@ -1018,8 +1015,7 @@ _scaled_dot_product_fused_attention_overrideable(
    double dropout_p,
    bool is_causal,
    bool return_debug_mask,
-    std::optional<double> scale,
-    bool compute_logsumexp) {
+    std::optional<double> scale) {
  TORCH_CHECK_NOT_IMPLEMENTED(false, "_scaled_dot_product_fused_attention_overrideable not implemented. This is an operator for privateuse1 backends, please use TORCH_LIBRARY_IMPL to override this function ");
 }

--- a/aten/src/ATen/native/transformers/cuda/flash_attn/flash_api.cpp
+++ b/aten/src/ATen/native/transformers/cuda/flash_attn/flash_api.cpp
@ -22,7 +22,6 @@
 #else
 #include <ATen/ops/empty.h>
 #include <ATen/ops/empty_like.h>
-#include <ATen/ops/zeros_like.h>
 #include <ATen/ops/reshape.h>
 #include <ATen/ops/scalar_tensor.h>
 #include <ATen/ops/sum.h>
@ -43,6 +42,7 @@ C10_DIAGNOSTIC_POP()
 #include <static_switch.h>
 #include <ATen/native/transformers/cuda/flash_attn/flash_api.h>

+
 #include <c10/util/Exception.h>

 namespace FLASH_NAMESPACE {
@ -417,26 +417,6 @@ mha_fwd(const at::Tensor &q,         // batch_size x seqlen_q x num_heads x head
    const int head_size_og = sizes[3];
    const int seqlen_k = k.size(1);
    const int num_heads_k = k.size(2);
-
-    if (batch_size == 0) {
-        auto opts = q.options();
-        at::Tensor out = at::empty({0, seqlen_q, num_heads, head_size_og}, opts);
-        at::Tensor q_padded = at::empty({0, seqlen_q, num_heads, head_size_og}, opts);
-        at::Tensor k_padded = at::empty({0, seqlen_k, num_heads_k, head_size_og}, opts);
-        at::Tensor v_padded = at::empty({0, seqlen_k, num_heads_k, head_size_og}, opts);
-        at::Tensor softmax_lse = at::empty({0, num_heads, seqlen_q}, opts.dtype(at::kFloat));
-        at::Tensor rng_state = at::empty({2}, at::dtype(c10::kUInt64).device(at::kCUDA));
-        at::Tensor _unused = at::empty({}, at::dtype(c10::kUInt64).device(at::kCUDA));
-        at::Tensor p = at::empty({0}, opts);
-        if (return_softmax) {
-            auto round_multiple = [](int x, int m) { return (x + m - 1) / m * m; };
-            const int seqlen_q_rounded = round_multiple(seqlen_q, 128);
-            const int seqlen_k_rounded = round_multiple(seqlen_k, 128);
-            p = at::empty({0, num_heads, seqlen_q_rounded, seqlen_k_rounded}, opts);
-        }
-        return {std::move(out), std::move(q_padded), std::move(k_padded), std::move(v_padded), std::move(softmax_lse), std::move(rng_state), _unused, std::move(p)};
-    }
-
    TORCH_CHECK(batch_size > 0, "batch size must be positive");
    TORCH_CHECK(head_size_og % 8 == 0, "head_size must be a multiple of 8, this is ensured by padding!");
    TORCH_CHECK(head_size_og <= 256, "FlashAttention forward only supports head dimension at most 256");
@ -567,7 +547,7 @@ mha_fwd(const at::Tensor &q,         // batch_size x seqlen_q x num_heads x head
        q_padded = q_padded.transpose(1, 2).reshape({batch_size, 1, num_heads_k * seqlen_q, head_size_og});
        softmax_lse = softmax_lse.reshape({batch_size, num_heads_k * seqlen_q, 1});
    }
-    return {std::move(out), std::move(q_padded), std::move(k_padded), std::move(v_padded), std::move(softmax_lse), std::move(rng_state), std::move(_unused), std::move(p)};
+    return {out, q_padded, k_padded, v_padded, softmax_lse, rng_state, _unused, p};
 }

 std::tuple<at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor, at::Tensor>
@ -872,6 +852,7 @@ mha_bwd(const at::Tensor &dout,  // batch_size x seqlen_q x num_heads, x head_si
    TORCH_CHECK(k.stride(-1) == 1, "Input tensor must have contiguous last dimension");
    TORCH_CHECK(v.stride(-1) == 1, "Input tensor must have contiguous last dimension");
    TORCH_CHECK(out.stride(-1) == 1, "out tensor must have contiguous last dimension");
+    TORCH_CHECK(dout.stride(-1) == 1, "dout tensor must have contiguous last dimension");

    const auto sizes = q.sizes();

@ -882,20 +863,6 @@ mha_bwd(const at::Tensor &dout,  // batch_size x seqlen_q x num_heads, x head_si
    const int head_size = sizes[3];
    const int seqlen_k = k.size(1);
    const int num_heads_k = k.size(2);
-
-    if (batch_size == 0) {
-        auto opts = q.options();
-        at::Tensor dq = at::empty_like(q);
-        at::Tensor dk = at::empty_like(k);
-        at::Tensor dv = at::empty_like(v);
-        auto round_multiple = [](int x, int m) { return (x + m - 1) / m * m; };
-        const int seqlen_q_rounded = round_multiple(seqlen_q, 128);
-        at::Tensor softmax_d = at::empty({0, num_heads, seqlen_q_rounded}, opts.dtype(at::kFloat));
-        return {dq, dk, dv, softmax_d};
-    }
-
-    TORCH_CHECK(dout.stride(-1) == 1, "dout tensor must have contiguous last dimension");
-
    TORCH_CHECK(batch_size > 0, "batch size must be positive");
    TORCH_CHECK(head_size % 8 == 0, "head_size should be a multiple of 8");
    TORCH_CHECK(head_size_og % 8 == 0, "head_size_og should be a multiple of 8, this is ensured by padding!");
--- a/benchmarks/dynamo/common.py
+++ b/benchmarks/dynamo/common.py
@ -1837,10 +1837,6 @@ class BenchmarkRunner:
    def skip_models_for_cuda(self):
        return set()

-    @property
-    def skip_models_for_xpu(self):
-        return set()
-
    @property
    def skip_models_for_cpu(self):
        return set()
@ -3931,8 +3927,6 @@ def run(runner, args, original_dir=None):
            runner.skip_models.update(runner.skip_models_for_cpu_aarch64)
    elif args.devices == ["cuda"]:
        runner.skip_models.update(runner.skip_models_for_cuda)
-    elif args.devices == ["xpu"]:
-        runner.skip_models.update(runner.skip_models_for_xpu)

    if not args.multiprocess:
        runner.skip_models.update(runner.skip_multiprocess_models)
--- a/benchmarks/dynamo/torchbench.py
+++ b/benchmarks/dynamo/torchbench.py
@ -124,10 +124,6 @@ class TorchBenchmarkRunner(BenchmarkRunner):
    def skip_models_for_cuda(self):
        return self._skip["device"]["cuda"]

-    @property
-    def skip_models_for_xpu(self):
-        return self._skip["device"]["xpu"]
-
    @property
    def skip_models_for_freezing_cuda(self):
        return self._skip["freezing"]["cuda"]
--- a/benchmarks/dynamo/torchbench.yaml
+++ b/benchmarks/dynamo/torchbench.yaml
@ -217,9 +217,6 @@ skip:

    cuda: []

-    xpu:
-      - *DETECTRON2_MODELS
-
  test:
    training:
      - *DETECTRON2_MODELS
--- a/benchmarks/transformer/config_utils.py
+++ b/benchmarks/transformer/config_utils.py
@ -1,157 +0,0 @@
-"""Configuration utilities for parsing JSON and YAML config files."""
-
-import json
-import re
-
-
-def heads_input_type(s: str) -> tuple[int, int]:
-    """Convert string format 'Hq,Hkv' to tuple (Hq, Hkv)."""
-    try:
-        hq, hkv = map(int, s.split(","))
-        return hq, hkv
-    except Exception as e:
-        raise ValueError("Heads must be Hq,Hkv") from e
-
-
-default_config = {
-    "dynamic": False,
-    "calculate_bwd": False,
-    "dtype": "bfloat16",
-    "b": [2, 8, 16],
-    "nh": ["16,16", "16,2"],
-    "s": [512, 1024, 4096],
-    "d": [64, 128],
-    "mods": ["noop", "causal", "alibi", "sliding_window"],
-    "backend": ["efficient"],
-    "max_autotune": False,
-    "decoding": False,
-    "kv_size": None,
-    "throughput": True,
-    "save_path": None,
-    "output_json_for_dashboard": None,
-    "benchmark_name": "PyTorch operator microbenchmark",
-}
-
-
-def load_config_file(config_path: str) -> dict:
-    """Load configuration from JSON or YAML file.
-
-    Automatically converts 'nh' field from strings to tuples.
-
-    Args:
-        config_path: Path to the configuration file
-
-    Returns:
-        Dictionary containing the configuration
-
-    Raises:
-        FileNotFoundError: If config file doesn't exist
-        ValueError: If config file format is invalid
-    """
-    with open(config_path) as f:
-        config_str = f.read()
-
-    # Try to load as JSON first
-    try:
-        config = json.loads(config_str)
-    except json.JSONDecodeError:
-        # Fall back to YAML parsing
-        config = _parse_simple_yaml(config_str)
-
-    # Apply automatic conversions for 'nh' field
-    if "nh" in config and isinstance(config["nh"], list):
-        config["nh"] = [
-            heads_input_type(h) if isinstance(h, str) else h for h in config["nh"]
-        ]
-
-    return config
-
-
-def _parse_simple_yaml(yaml_str: str) -> dict:
-    """Simple YAML parser for basic configs (without external dependencies).
-
-    Supports:
-    - key: value pairs
-    - booleans (true/false)
-    - null values
-    - integers and floats
-    - strings (quoted and unquoted)
-    - lists in JSON format [item1, item2, ...]
-    - comments (lines starting with # or after #)
-
-    Args:
-        yaml_str: YAML content as string
-
-    Returns:
-        Dictionary containing parsed YAML content
-    """
-    config = {}
-
-    for line in yaml_str.split("\n"):
-        # Remove comments
-        line = line.split("#")[0].strip()
-
-        if not line or ":" not in line:
-            continue
-
-        key, value = line.split(":", 1)
-        key = key.strip()
-        value = value.strip()
-
-        # Parse value based on type
-        if value.lower() == "true":
-            config[key] = True
-        elif value.lower() == "false":
-            config[key] = False
-        elif value.lower() in ("null", "none", ""):
-            config[key] = None
-        elif value.startswith("[") and value.endswith("]"):
-            # Parse list - handle quoted strings properly
-            pattern = r'"([^"]+)"|\'([^\']+)\'|([^,\[\]\s]+)'
-            matches = re.findall(pattern, value[1:-1])  # Remove [ ]
-            parsed_items = []
-            for match in matches:
-                # match is a tuple of (double_quoted, single_quoted, unquoted)
-                item = match[0] or match[1] or match[2]
-                item = item.strip()
-                if item:
-                    try:
-                        parsed_items.append(int(item))
-                    except ValueError:
-                        parsed_items.append(item)
-            config[key] = parsed_items
-        elif value.startswith(('"', "'")):
-            config[key] = value.strip("\"'")
-        else:
-            # Try to parse as number
-            try:
-                config[key] = int(value)
-            except ValueError:
-                try:
-                    config[key] = float(value)
-                except ValueError:
-                    config[key] = value
-
-    return config
-
-
-def print_default_config(output_format: str) -> None:
-    """Print a default configuration template in JSON or YAML format.
-
-    Args:
-        output_format: Either "json" or "yaml"
-    """
-    if output_format == "json":
-        print(json.dumps(default_config, indent=2))
-    else:  # yaml
-        for key, value in default_config.items():
-            if value is None:
-                print(f"{key}: null")
-            elif isinstance(value, bool):
-                print(f"{key}: {str(value).lower()}")
-            elif isinstance(value, str):
-                print(f'{key}: "{value}"')
-            elif isinstance(value, list):
-                print(f"{key}: {json.dumps(value)}")
-            else:
-                print(f"{key}: {value}")
--- a/benchmarks/transformer/configs/config_basic.yaml
+++ b/benchmarks/transformer/configs/config_basic.yaml
@ -1,29 +0,0 @@
-# Basic benchmark configuration for PyTorch transformer benchmarks
-# Usage: python score_mod.py --config config_basic.yaml
-
-# Core parameters
-dynamic: false
-calculate_bwd: true
-dtype: "bfloat16"
-
-# Shape parameters - larger sweep
-b: [1, 2, 4, 8, 16]  # batch sizes
-nh: ["16,16", "16,2", "32,32", "32,4"]  # [query_heads,key_value_heads]
-s: [512, 1024, 2048, 4096, 8192]  # sequence lengths
-d: [64, 128]  # head dimensions (limited to 128 for Flash Attention/cuDNN compatibility)
-
-# All attention types
-mods: ["noop", "causal", "rel", "head_bias", "alibi", "sliding_window", "prefix_lm", "softcap"]
-
-# Multiple backends for comparison (SDPA + Flash Attention) - flex is always included internally
-backend: ["efficient", "math", "cudnn", "fav2"]
-max_autotune: true  # Enable torch.compile with max-autotune for optimal performance
-
-# Decoding and cache settings
-decoding: false
-kv_size: null
-
-# Metrics and output
-throughput: true  # Calculate memory bandwidth & TFLOPS
-save_path: "comprehensive_results.csv"  # Save to CSV
-output_json_for_dashboard: "attn_bench_basic.json"
--- a/benchmarks/transformer/score_mod.py
+++ b/benchmarks/transformer/score_mod.py
@ -1,19 +1,15 @@
 import argparse
 import csv
-import gc
 import itertools
-import json
 import random
-import sys
 from collections import defaultdict
 from collections.abc import Callable
 from contextlib import nullcontext
 from dataclasses import asdict, dataclass
-from functools import partial, wraps
-from typing import Literal, Optional, Union
+from functools import partial
+from typing import Optional, Union

 import numpy as np
-from config_utils import heads_input_type, load_config_file, print_default_config
 from tabulate import tabulate
 from tqdm import tqdm

@ -37,96 +33,6 @@ torch._dynamo.config.recompile_limit = 1000
 from torch._inductor.runtime.benchmarking import benchmarker


-def cleanup_memory():
-    """Aggressively free GPU memory"""
-    torch.cuda.empty_cache()
-    gc.collect()
-    if torch.cuda.is_available():
-        torch.cuda.synchronize()
-
-
-def safe_backend(backend_name=None, return_dict=False):
-    """Decorator that wraps backend functions with error handling
-
-    Args:
-        backend_name: Name of the backend for error messages
-        return_dict: If True, returns dict of results for all backends (for run_single_experiment)
-                     If False, returns single ExperimentResults (for individual backend functions)
-    """
-
-    def decorator(func):
-        @wraps(func)
-        def wrapper(config, *args, **kwargs):
-            try:
-                return func(config, *args, **kwargs)
-            except torch.OutOfMemoryError:
-                print(
-                    f"[SKIP] OOM for {backend_name or func.__name__} with shape {config.shape}"
-                )
-                cleanup_memory()
-            except RuntimeError as e:
-                error_msg = str(e)
-                if "out of resource" in error_msg or "OutOfMemoryError" in error_msg:
-                    print(
-                        f"[SKIP] Triton OOM for {backend_name or func.__name__} with shape {config.shape}"
-                    )
-                    cleanup_memory()
-                elif "No valid triton configs" in error_msg:
-                    print(
-                        f"[SKIP] No valid Triton config for {backend_name or func.__name__} with shape {config.shape}"
-                    )
-                else:
-                    print(
-                        f"[SKIP] Runtime error for {backend_name or func.__name__} with shape {config.shape}: {str(e)[:100]}"
-                    )
-            except Exception as e:
-                print(
-                    f"[SKIP] Error for {backend_name or func.__name__} with shape {config.shape}: {str(e)[:100]}"
-                )
-
-            # Return appropriate NaN result based on function type
-            if return_dict:
-                # For run_single_experiment: return dict with NaN for all backends
-                nan_result = ExperimentResults(
-                    fwd_time=float("nan"),
-                    bwd_time=float("nan") if config.calculate_bwd_time else None,
-                )
-                results = dict.fromkeys(config.backends, nan_result)
-                results["flex"] = ExperimentResults(
-                    fwd_time=float("nan"),
-                    bwd_time=float("nan") if config.calculate_bwd_time else None,
-                    sparsity=None,
-                )
-                return results
-            else:
-                # For individual backend functions: return single ExperimentResults
-                return ExperimentResults(
-                    fwd_time=float("nan"),
-                    bwd_time=float("nan") if config.calculate_bwd_time else None,
-                )
-
-        return wrapper
-
-    return decorator
-
-
-# Type definitions
-Backend = Literal["math", "efficient", "cudnn", "fav2", "fav3", "fakv", "og-eager"]
-AttentionType = Literal[
-    "noop",
-    "causal",
-    "rel",
-    "head_bias",
-    "alibi",
-    "sliding_window",
-    "document_mask",
-    "prefix_lm",
-    "softcap",
-]
-DtypeString = Literal["bfloat16", "float16", "float32"]
-SpeedupType = Literal["fwd", "bwd"]
-
-
 def benchmark_torch_function_in_microseconds(func: Callable, *args, **kwargs) -> float:
    # warmup
    for _ in range(5):
@ -142,7 +48,6 @@ class ExperimentConfig:
    calculate_bwd_time: bool
    cal_bandwidth: bool
    backends: list[str]
-    max_autotune: bool

    def __post_init__(self):
        assert len(self.shape) == 6, (
@ -157,7 +62,6 @@ class ExperimentConfig:
        d.pop("cal_bandwidth", None)
        d["shape(B,Hq,M,Hkv,N,D)"] = d.pop("shape")
        d.pop("backends", None)
-        d.pop("max_autotune", False)
        return d


@ -305,7 +209,6 @@ def query_key_value_clones(
    return query_ref, key_ref, value_ref


-@safe_backend("SDPA")
 def run_single_backend_sdpa(
    config: ExperimentConfig,
    query: torch.Tensor,
@ -320,7 +223,6 @@ def run_single_backend_sdpa(
    backend_context = get_backend_context(backend)
    with backend_context:
        _device = torch.device("cuda")
-
        eager_sdpa = generate_eager_sdpa(
            config.attn_type, config.shape, config.dtype, block_mask, score_mod
        )
@ -388,7 +290,6 @@ def run_single_backend_sdpa(
            )


-@safe_backend("FlashAttention")
 def run_single_backend_FA(
    config: ExperimentConfig,
    query: torch.Tensor,
@ -400,9 +301,9 @@ def run_single_backend_FA(
    mask_kwargs,
    backend: str,
 ) -> ExperimentResults:
-    assert backend in ["fav3", "fakv"]
+    assert backend in ["fav2", "fav3", "fakv"]
    # Generate callable for specific backend.
-    if backend in ["fav3"]:
+    if backend in ["fav2", "fav3"]:
        FA = generate_FA_callable(
            config.attn_type, config.shape, config.dtype, backend, **mask_kwargs
        )
@ -453,10 +354,10 @@ def run_single_backend_FA(
    )


-@safe_backend("flex_attention", return_dict=True)
 def run_single_experiment(
    config: ExperimentConfig,
    dynamic=False,
+    max_autotune=False,
 ) -> dict[str, ExperimentResults]:
    device = torch.device("cuda")
    batch_size, q_heads, q_seq_len, kv_heads, kv_seq_len, head_dim = config.shape
@ -476,7 +377,7 @@ def run_single_experiment(
    block_mask, mask_kwargs = generate_block_mask(config.attn_type, config.shape)
    kernel_options = get_kernel_options(config.attn_type, config.shape)

-    if config.max_autotune:
+    if max_autotune:
        compiled_sdpa = torch.compile(
            flex_attention, dynamic=dynamic, mode="max-autotune-no-cudagraphs"
        )
@ -506,7 +407,7 @@ def run_single_experiment(

    results = {}
    for backend in config.backends:
-        if backend in ["fav3", "fakv"]:
+        if backend in ["fav2", "fav3", "fakv"]:
            results[backend] = run_single_backend_FA(
                config,
                query,
@ -518,7 +419,7 @@ def run_single_experiment(
                mask_kwargs,
                backend,
            )
-        else:  # sdpa (also supports fav2)
+        else:  # sdpa
            results[backend] = run_single_backend_sdpa(
                config,
                query,
@ -539,7 +440,7 @@ def run_single_experiment(
    sparsity = block_mask.sparsity() / 100.0 if block_mask is not None else 0.0
    sparsity = sparsity if config.attn_type != "document_mask" else 0.5

-    results["flex"] = ExperimentResults(
+    results["compiled"] = ExperimentResults(
        fwd_time=forward_compiled_time,
        bwd_time=backward_compile_time if config.calculate_bwd_time else None,
        sparsity=sparsity,
@ -600,15 +501,15 @@ def calculate_tflops(config: ExperimentConfig, results: ExperimentResults) -> fl
    softmax_flops = M * N * 2  # Not counting online softmax overhead
    o_flops = M * D * N * 2
    # Not counting split k overhead
-    sparsity = results.sparsity if results.sparsity is not None else 0.0
-    total_flops = B * Hq * (qk_flops + softmax_flops + o_flops) * (1 - sparsity)
+    total_flops = B * Hq * (qk_flops + softmax_flops + o_flops) * (1 - results.sparsity)
    return total_flops / results.fwd_time / 1e6  # in TFLOPs/


 def get_average_speedups(results: list[Experiment], type: str, backend: str):
    # Calculate speedups
    speedups = [
-        calculate_speedup(r.results["flex"], r.results[backend], type) for r in results
+        calculate_speedup(r.results["compiled"], r.results[backend], type)
+        for r in results
    ]

    # Find indices of max and min speedups
@ -636,7 +537,7 @@ def get_average_speedups(results: list[Experiment], type: str, backend: str):
 def print_results(results: list[Experiment], save_path: Optional[str] = None):
    table_data = defaultdict(list)
    for experiment in results:
-        backends = experiment.config.backends + ["flex"]
+        backends = experiment.config.backends + ["compiled"]
        for key, value in experiment.asdict().items():
            if key in backends:
                if value.fwd_time:
@ -649,43 +550,45 @@ def print_results(results: list[Experiment], save_path: Optional[str] = None):
    # Calculate speedups
    for backend in results[0].config.backends:
        fwd_speedups = [
-            calculate_speedup(r.results["flex"], r.results[backend], type="fwd")
+            calculate_speedup(r.results["compiled"], r.results[backend], type="fwd")
            for r in results
        ]
-        table_data[f"fwd_speedup_flex_over_{backend}"] = fwd_speedups
+        table_data[f"fwd_{backend}_speedup"] = fwd_speedups

    if results[0].config.calculate_bwd_time:
        for backend in results[0].config.backends:
            bwd_speedups = [
-                calculate_speedup(r.results["flex"], r.results[backend], type="bwd")
+                calculate_speedup(r.results["compiled"], r.results[backend], type="bwd")
                for r in results
            ]
-            table_data[f"bwd_speedup_flex_over_{backend}"] = bwd_speedups
+            table_data[f"bwd_{backend}_speedup"] = bwd_speedups

    # Calculate mem + computational throughput
    if results[0].config.cal_bandwidth:
        fwd_bandwidth = [
-            calculate_bandwidth(r.config, r.results["flex"], type="fwd")
+            calculate_bandwidth(r.config, r.results["compiled"], type="fwd")
            for r in results
        ]
        table_data["fwd_mem_bw (TB/s)"] = fwd_bandwidth
-        fwd_tflops = [calculate_tflops(r.config, r.results["flex"]) for r in results]
+        fwd_tflops = [
+            calculate_tflops(r.config, r.results["compiled"]) for r in results
+        ]
        table_data["TFlops/s"] = fwd_tflops

    print(tabulate(table_data, headers="keys", tablefmt="github", floatfmt=".3f"))

    for backend in results[0].config.backends:
-        if np.isnan(table_data[f"fwd_speedup_flex_over_{backend}"]).all():
+        if np.isnan(table_data[f"fwd_{backend}_speedup"]).all():
            continue
        print("\n")
-        print(f"FWD Speedup of Flex over {backend}".center(125, "="))
+        print(f"FWD Speedups vs. {backend}".center(125, "="))
        print("\n")
        average_data = get_average_speedups(results, type="fwd", backend=backend)
        print(tabulate(average_data, headers="keys", tablefmt="github", floatfmt=".3f"))

        if results[0].config.calculate_bwd_time:
            print("\n")
-            print(f"BWD Speedup of Flex over {backend}".center(125, "="))
+            print(f"BWD Speedups vs. {backend}".center(125, "="))
            print("\n")
            average_data = get_average_speedups(results, type="bwd", backend=backend)
            print(
@ -888,14 +791,14 @@ def get_backend_context(backend: str):
    Returns a context manager for the specified backend.
    Args:
        backend (str): The name of the backend to use.
-                       Valid options are 'math', 'efficient', 'cudnn', 'fav2', 'fav3', 'fakv', 'og-eager'.
+                       Valid options are 'fav2', 'cudnn', 'math', 'efficient', 'fav3', 'fakv', 'og-eager'.
    Returns:
        A context manager for the specified backend.
    Raises:
        ValueError: If an invalid backend is specified.
    """
    backends = {
-        "fav2": sdpa_kernel(SDPBackend.FLASH_ATTENTION),
+        "fav2": nullcontext(),
        "cudnn": sdpa_kernel(SDPBackend.CUDNN_ATTENTION),
        "math": sdpa_kernel(SDPBackend.MATH),
        "efficient": sdpa_kernel(SDPBackend.EFFICIENT_ATTENTION),
@ -917,7 +820,15 @@ def generate_FA_callable(
 ) -> Callable | None:
    if dtype not in [torch.float16, torch.bfloat16]:
        return None
-    if backend == "fav3":
+    if backend == "fav2":
+        try:
+            from flash_attn import flash_attn_func, flash_attn_varlen_func
+        except ImportError:
+            print(
+                "Flash attention 2 is not installed. Please install it to run fav2 backend. "
+            )
+            raise
+    elif backend == "fav3":
        try:
            from flash_attn.flash_attn_interface import (
                flash_attn_func,
@ -1123,7 +1034,6 @@ def generate_experiment_configs(
    kv_cache_size: list[int],
    cal_bandwidth: bool,
    backends: list[str],
-    max_autotune: bool,
 ) -> list[ExperimentConfig]:
    assert not (calculate_bwd and decoding), "Decoding does not support backward"

@ -1167,333 +1077,52 @@ def generate_experiment_configs(
                calculate_bwd_time=calculate_bwd,
                cal_bandwidth=cal_bandwidth,
                backends=backends,
-                max_autotune=max_autotune,
            )
        )

    return all_configs


-def _output_json_for_dashboard(
-    experiments,
-    output_file,
-    benchmark_name="PyTorch operator microbenchmark",
-):
-    """
-    Write the result into JSON format for PyTorch OSS dashboard.
-    The JSON format is defined at
-    https://github.com/pytorch/pytorch/wiki/How-to-integrate-with-PyTorch-OSS-benchmark-database
-
-    Args:
-        experiments: List of experiment results
-        output_file: Path to output JSON file
-        benchmark_name: Name of the benchmark
-    """
-    if not experiments:
-        return
-
-    import math
-    import platform
-    from dataclasses import asdict, dataclass
-    from typing import Any, Optional
-
-    # Prepare headers and records for JSON output
-    records = []
-    for experiment in experiments:
-        config = experiment.config
-        results_dict = (
-            experiment.results
-        )  # This is a dict: backend -> ExperimentResults
-
-        # Process each backend result
-        for backend, results in results_dict.items():
-            # Skip backends that were not run (NaN results)
-            if math.isnan(results.fwd_time):
-                continue
-
-            # Extract data from experiment
-            test_name = f"{backend}_{config.attn_type}_"
-            input_config = f"shape: {config.shape}, dtype: {config.dtype}"
-
-            # Determine mode based on backward pass
-            mode = "training" if config.calculate_bwd_time else "inference"
-
-            # Extract dtype
-            dtype = (
-                str(config.dtype).split(".")[1]
-                if "." in str(config.dtype)
-                else str(config.dtype)
-            )
-
-            # Determine device
-            device = "cuda"
-
-            # Get device architecture
-            device_arch = (
-                torch.cuda.get_device_name(0)
-                if device == "cuda"
-                else platform.processor()
-                if device == "cpu"
-                else "unknown"
-            )
-
-            # Create dataclasses for JSON structure
-            @dataclass
-            class BenchmarkInfo:
-                name: str
-                mode: Optional[str]
-                dtype: str
-                extra_info: dict[str, Any]
-
-            @dataclass
-            class ModelInfo:
-                name: str
-                type: str
-                origins: list[str]
-                extra_info: dict[str, Any]
-
-            @dataclass
-            class MetricInfo:
-                name: str
-                unit: str
-                benchmark_values: list[float]
-                target_value: Optional[float]
-
-            @dataclass
-            class BenchmarkRecord:
-                benchmark: BenchmarkInfo
-                model: ModelInfo
-                metric: MetricInfo
-
-            # Benchmark extra info
-            benchmark_extra_info = {
-                "input_config": input_config,
-                "device": device,
-                "arch": device_arch,
-                "operator_name": backend,
-                "attn_type": config.attn_type,
-                "shape": str(config.shape),
-                "max_autotune": config.max_autotune,
-            }
-            # Add record for forward latency
-            record_fwd_latency = BenchmarkRecord(
-                benchmark=BenchmarkInfo(
-                    name=benchmark_name,
-                    mode=mode,
-                    dtype=dtype,
-                    extra_info=benchmark_extra_info,
-                ),
-                model=ModelInfo(
-                    name=test_name + str(config.shape),
-                    type="attention-benchmark",
-                    origins=["pytorch"],
-                    extra_info={
-                        "operator_name": backend,
-                        "attn_type": config.attn_type,
-                    },
-                ),
-                metric=MetricInfo(
-                    name="forward latency",
-                    unit="us",
-                    benchmark_values=[results.fwd_time],
-                    target_value=None,
-                ),
-            )
-            records.append(asdict(record_fwd_latency))
-
-            # Add record for forward memory bandwidth (if available)
-            if config.cal_bandwidth:
-                record_fwd_bandwidth = BenchmarkRecord(
-                    benchmark=BenchmarkInfo(
-                        name=benchmark_name,
-                        mode=mode,
-                        dtype=dtype,
-                        extra_info=benchmark_extra_info,
-                    ),
-                    model=ModelInfo(
-                        name=test_name + str(config.shape),
-                        type="attention-benchmark",
-                        origins=["pytorch"],
-                        extra_info={
-                            "operator_name": backend,
-                        },
-                    ),
-                    metric=MetricInfo(
-                        name="memory bandwidth",
-                        unit="TB/s",
-                        benchmark_values=[calculate_bandwidth(config, results, "fwd")],
-                        target_value=None,
-                    ),
-                )
-                records.append(asdict(record_fwd_bandwidth))
-
-            # Add record for forward TFLOPS (if available)
-            if config.cal_bandwidth:
-                record_fwd_tflops = BenchmarkRecord(
-                    benchmark=BenchmarkInfo(
-                        name=benchmark_name,
-                        mode=mode,
-                        dtype=dtype,
-                        extra_info=benchmark_extra_info,
-                    ),
-                    model=ModelInfo(
-                        name=test_name + str(config.shape),
-                        type="attention-benchmark",
-                        origins=["pytorch"],
-                        extra_info={
-                            "operator_name": backend,
-                        },
-                    ),
-                    metric=MetricInfo(
-                        name="tflops",
-                        unit="TFLOPS/s",
-                        benchmark_values=[calculate_tflops(config, results)],
-                        target_value=None,
-                    ),
-                )
-                records.append(asdict(record_fwd_tflops))
-
-            # Add record for backward latency (if available and not NaN)
-            if (
-                config.calculate_bwd_time
-                and results.bwd_time is not None
-                and not math.isnan(results.bwd_time)
-            ):
-                record_bwd_latency = BenchmarkRecord(
-                    benchmark=BenchmarkInfo(
-                        name=benchmark_name,
-                        mode=mode,
-                        dtype=dtype,
-                        extra_info=benchmark_extra_info,
-                    ),
-                    model=ModelInfo(
-                        name=test_name + str(config.shape),
-                        type="attention-benchmark",
-                        origins=["pytorch"],
-                        extra_info={
-                            "operator_name": backend,
-                        },
-                    ),
-                    metric=MetricInfo(
-                        name="backward latency",
-                        unit="us",
-                        benchmark_values=[results.bwd_time],
-                        target_value=None,
-                    ),
-                )
-                records.append(asdict(record_bwd_latency))
-
-    # Write all records to the output file
-    with open(output_file, "w", encoding="utf-8") as f:
-        json.dump(records, f, indent=2)
-
-
-def main(
-    dynamic: bool = False,
-    calculate_bwd: bool = False,
-    dtype: DtypeString = "bfloat16",
-    b: list[int] | None = None,
-    nh: list[str] | None = None,
-    s: list[int] | None = None,
-    d: list[int] | None = None,
-    mods: list[AttentionType] | None = None,
-    backend: list[Backend] | None = None,
-    max_autotune: bool = False,
-    decoding: bool = False,
-    kv_size: Optional[list[int]] = None,
-    throughput: bool = True,
-    save_path: Optional[str] = None,
-    output_json_for_dashboard: Optional[str] = None,
-    benchmark_name: str = "PyTorch operator microbenchmark",
-) -> None:
-    """Run sweep over sizes and score mods for flex attention.
-
-    Usage Examples:
-        # Use a yml config file
-        python score_mod.py --config basic_config.yaml
-
-        # Use a json config file
-        python score_mod.py --config my_config.json
-
-        # Generate a config template
-        python score_mod.py --print-config json > my_config.json # For a json config
-        python score_mod.py --print-config yaml > my_config.yaml # For a yaml config
-
-        # Override config with CLI args
-        python score_mod.py --config my_config.json -dtype float16 --max-autotune
-
-        # Pure CLI usage
-        python score_mod.py -b 4 8 -s 1024 2048 -mods causal alibi --backend efficient
-
-    Args:
-        dynamic: Runs a dynamic shapes version of compiled flex attention
-        calculate_bwd: Calculate backward pass times
-        dtype: Data type for tensors (bfloat16, float16, float32)
-        b: Batch sizes to benchmark
-        nh: Number of query and key/value heads in format "Hq,Hkv"
-        s: Sequence lengths to benchmark
-        d: Head dimensions to benchmark
-        mods: Score modifications: noop, causal, rel, head_bias, alibi, sliding_window, document_mask, prefix_lm, softcap
-        backend: Backends for attention computation: math, efficient, cudnn, fav2, fav3, fakv, og-eager
-        max_autotune: Turn on max-autotune optimization
-        decoding: Benchmark decoding mode (query sequence length = 1)
-        kv_size: Key/value cache size in MiB (ignores batch size if specified)
-        throughput: Calculate kernel memory bandwidth & computational throughput (always True)
-        save_path: Path to save the results CSV file
-        output_json_for_dashboard: Path to save results in JSON format for PyTorch OSS dashboard
-        benchmark_name: Name of the benchmark for dashboard output
-    """
-    # Convert dtype string to torch dtype (if not already converted)
-    import torch
-
-    if isinstance(dtype, str):
-        dtype = getattr(torch, dtype)
-
-    # Always calculate throughput
-    throughput = True
-    print("Backend: ", backend)
+def main(args):
    seed = 123
    np.random.seed(seed)
    torch.manual_seed(seed)
    results = []
-    for experiment_count, config in enumerate(
-        tqdm(
-            generate_experiment_configs(
-                calculate_bwd,
-                dtype,
-                b,
-                nh,
-                s,
-                d,
-                mods,
-                decoding,
-                kv_size,
-                throughput,
-                backend,
-                max_autotune,
-            )
-        ),
-        start=1,
+    for config in tqdm(
+        generate_experiment_configs(
+            args.calculate_bwd,
+            args.dtype,
+            args.b,
+            args.nh,
+            args.s,
+            args.d,
+            args.mods,
+            args.decoding,
+            args.kv_size,
+            args.throughput,
+            args.backend,
+        )
    ):
        results.append(
            Experiment(
                config,
                run_single_experiment(
                    config,
-                    dynamic=dynamic,
+                    dynamic=args.dynamic,
+                    max_autotune=args.max_autotune,
                ),
            )
        )

-        # Periodic memory cleanup every 50 experiments
-        if experiment_count % 50 == 0:
-            cleanup_memory()
+    print_results(results, args.save_path)

-    print_results(results, save_path)

-    # Output JSON for dashboard if requested
-    if output_json_for_dashboard:
-        _output_json_for_dashboard(results, output_json_for_dashboard, benchmark_name)
+def heads_input_type(s):
+    try:
+        hq, hkv = map(int, s.split(","))
+        return hq, hkv
+    except Exception as e:
+        raise argparse.ArgumentTypeError("Heads must be Hq,Hkv") from e


 if __name__ == "__main__":
@ -1501,12 +1130,6 @@ if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Run sweep over sizes and score mods for flex attention"
    )
-    parser.add_argument(
-        "--config",
-        type=str,
-        help="Path to JSON config file. CLI args override config file values.",
-        default=None,
-    )
    parser.add_argument(
        "--dynamic",
        action="store_true",
@ -1576,49 +1199,8 @@ Ignores -b batch size and calculate batch size from kv size instead when specifi
        default=["efficient"],
        help="Backend to use for attention computation",
    )
-    parser.add_argument(
-        "--output-json-for-dashboard",
-        type=str,
-        help="Path to save results in JSON format for PyTorch OSS dashboard",
-        default=None,
-    )
-    parser.add_argument(
-        "--benchmark-name",
-        type=str,
-        help="Name of the benchmark for dashboard output",
-        default="PyTorch operator microbenchmark",
-    )
-    parser.add_argument(
-        "--print-config",
-        type=str,
-        choices=["json", "yaml"],
-        help="Print a default config template in JSON or YAML format and exit",
-        default=None,
-    )
    # Parse arguments
    args = parser.parse_args()
+    args.dtype = getattr(torch, args.dtype)

-    # Handle --print-config
-    if args.print_config:
-        print_default_config(args.print_config)
-        sys.exit(0)
-
-    # Load and merge config if provided
-    if args.config:
-        config = load_config_file(args.config)
-
-        # Merge config with CLI args (CLI args take precedence)
-        json_args = argparse.Namespace()
-        json_args.__dict__ = config
-        args = parser.parse_args(namespace=json_args)
-
-    # Convert dtype string to torch dtype (only if it's still a string)
-    if isinstance(args.dtype, str):
-        args.dtype = getattr(torch, args.dtype)
-
-    # Remove config and print_config from args before passing to main
-    args_dict = vars(args)
-    args_dict.pop("config", None)
-    args_dict.pop("print_config", None)
-
-    main(**args_dict)
+    main(args)
--- a/build_variables.bzl
+++ b/build_variables.bzl
@ -482,7 +482,6 @@ inductor_core_resources = [
    "torch/csrc/inductor/aoti_torch/oss_proxy_executor.cpp",
    "torch/csrc/inductor/inductor_ops.cpp",
    "torch/csrc/jit/serialization/pickle.cpp",
-    "torch/csrc/shim_common.cpp",
 ]

 libtorch_core_sources = sorted(
--- a/c10/core/SymInt.h
+++ b/c10/core/SymInt.h
@ -556,26 +556,3 @@ inline SymBool sym_ge(const SymInt& a, const SymInt& b) {
 }

 } // namespace c10
-
-#include <limits>
-
-namespace std {
-
-template <>
-class numeric_limits<c10::SymInt> {
- public:
-  static constexpr bool is_specialized = true;
-
-  static constexpr int64_t max() noexcept {
-    return std::numeric_limits<int64_t>::max();
-  }
-
-  static constexpr int64_t min() noexcept {
-    return std::numeric_limits<int64_t>::min();
-  }
-
-  static constexpr bool is_signed = true;
-  static constexpr bool is_integer = true;
-};
-
-} // namespace std
--- a/c10/metal/special_math.h
+++ b/c10/metal/special_math.h
@ -1,4 +1,4 @@
-// Implementation of special math functions for Metal
+// Implementation of special math functions for Metal
 #pragma once
 #include <c10/metal/expm1f.h>
 #include <c10/metal/igamma.h>
@ -624,64 +624,6 @@ inline T spherical_bessel_j0(T x) {
  return static_cast<T>(::metal::sin(x) / x);
 }

-template <typename T>
-inline ::metal::enable_if_t<is_scalar_floating_point_v<T>, T> logaddexp(
-    T a,
-    T b) {
-  float a0 = static_cast<float>(a);
-  float b0 = static_cast<float>(b);
-  if (::metal::isinf(a0) && a0 == b0) {
-    return static_cast<T>(a0);
-  } else {
-    float m0 = ::metal::max(a0, b0);
-    return static_cast<T>(
-        m0 + ::c10::metal::log1p(::metal::exp(-::metal::abs(a0 - b0))));
-  }
-}
-
-// The function is ported from mlx
-template <typename T>
-inline ::metal::enable_if_t<is_complex_v<T>, T> logaddexp(T a, T b) {
-  if (::metal::isnan(a.x) || ::metal::isnan(a.y) || ::metal::isnan(b.x) ||
-      ::metal::isnan(b.y)) {
-    return T(NAN, NAN);
-  }
-
-  T maxval = a.x > b.x ? a : b;
-  T minval = a.x < b.x ? a : b;
-  constexpr auto inf = ::metal::numeric_limits<T>::infinity().x;
-
-  if (minval.x == -inf || maxval.x == inf) {
-    return maxval;
-  }
-
-  float2 maxval_ = static_cast<float2>(maxval);
-  float2 minval_ = static_cast<float2>(minval);
-  float m = ::metal::exp(minval_.x - maxval_.x);
-  float2 dexp{
-      m * ::metal::cos(minval_.y - maxval_.y),
-      m * ::metal::sin(minval_.y - maxval_.y),
-  };
-  return static_cast<T>(maxval_ + ::c10::metal::log1p(dexp));
-}
-
-template <typename T>
-inline T logaddexp2(T a, T b) {
-  constexpr auto log_2 = float(0.693147180559945309417232121458176);
-  constexpr auto inv_log_2 = float(1) / log_2;
-  float a0 = static_cast<float>(a);
-  float b0 = static_cast<float>(b);
-  if (::metal::isinf(a0) && a0 == b0) {
-    return static_cast<T>(a0);
-  } else {
-    float m0 = ::metal::max(a0, b0);
-    return static_cast<T>(
-        m0 +
-        ::c10::metal::log1p(::metal::pow(float(2), -::metal::abs(a0 - b0))) *
-            inv_log_2);
-  }
-}
-
 template <typename T>
 inline float xlog1py(T x, T y) {
  if (::metal::isnan(y)) {
--- a/c10/metal/utils.h
+++ b/c10/metal/utils.h
@ -322,24 +322,6 @@ inline float log1p(float x) {
  return rc;
 }

-// The function is ported from mlx
-inline float2 log1p(float2 in) {
-  float x = in.x;
-  float y = in.y;
-  float zabs = ::metal::precise::sqrt(x * x + y * y);
-  float theta = ::metal::atan2(y, x + 1);
-  if (zabs < 0.5f) {
-    float r = x * (2 + x) + y * y;
-    if (r == 0) { // handle underflow
-      return {x, theta};
-    }
-    return {0.5f * log1p(r), theta};
-  } else {
-    auto z0 = ::metal::sqrt((x + 1) * (x + 1) + y * y);
-    return {::metal::log(z0), theta};
-  }
-}
-
 template <typename T1, typename T2 = T1>
 struct pair {
  T1 first;
--- a/c10/mobile/CPUProfilingAllocator.cpp
+++ b/c10/mobile/CPUProfilingAllocator.cpp
@ -34,7 +34,7 @@ struct MemEvent {
 bool overlaps(const MemBlock& a, const MemBlock& b) {
  // two blocks dont overlap if
  // |---a--------|--------------b--------|
-  // start_a     end_a <= start_b       end_b
+  // start_a     end_a <= start_b       end_b
  return !(
      (a.end_offset <= b.start_offset) || (b.end_offset <= a.start_offset));
 }
--- a/c10/util/Bitset.h
+++ b/c10/util/Bitset.h
@ -33,7 +33,7 @@ struct bitset final {
  constexpr bitset() noexcept = default;
  constexpr bitset(const bitset&) noexcept = default;
  constexpr bitset(bitset&&) noexcept = default;
-  // there is an issue for gcc 5.3.0 when define default function as constexpr
+  // there is an issue for gcc 5.3.0 when define default function as constexpr
  // see https://gcc.gnu.org/bugzilla/show_bug.cgi?id=68754.
  bitset& operator=(const bitset&) noexcept = default;
  bitset& operator=(bitset&&) noexcept = default;
--- a/c10/xpu/XPUCachingAllocator.cpp
+++ b/c10/xpu/XPUCachingAllocator.cpp
@ -123,8 +123,6 @@ class DeviceCachingAllocator {
  ska::flat_hash_map<xpu::XPUStream, std::deque<std::pair<sycl::event, Block*>>>
      xpu_events;
  DeviceIndex device_index;
-  size_t allowed_memory_maximum = 0;
-  bool set_fraction = false;

  size_t try_merge_blocks(Block* dst, Block* src, BlockPool& pool) {
    if (!src || src->allocated || src->event_count > 0 ||
@ -247,12 +245,6 @@ class DeviceCachingAllocator {
    if (isRetry) {
      stats.num_alloc_retries += 1;
    }
-    if (set_fraction &&
-        stats.reserved_bytes[static_cast<size_t>(StatType::AGGREGATE)].current +
-                size >
-            allowed_memory_maximum) {
-      return false;
-    }
    void* ptr = sycl::aligned_alloc_device(
        kDeviceAlignment,
        size,
@ -443,11 +435,6 @@ class DeviceCachingAllocator {
        device_free =
            raw_device.get_info<sycl::ext::intel::info::device::free_memory>();
      }
-      std::string allowed_info;
-      if (set_fraction) {
-        allowed_info = format_size(allowed_memory_maximum) + " allowed; ";
-      }
-
      auto allocated_bytes =
          stats.allocated_bytes[static_cast<size_t>(StatType::AGGREGATE)]
              .current;
@ -472,9 +459,7 @@ class DeviceCachingAllocator {
          format_size(device_total),
          " of which ",
          format_size(device_free),
-          " is free. ",
-          allowed_info,
-          "Of the allocated memory ",
+          " is free. Of the allocated memory ",
          format_size(allocated_bytes),
          " is allocated by PyTorch, and ",
          format_size(reserved_bytes - allocated_bytes),
@ -553,25 +538,6 @@ class DeviceCachingAllocator {
      stats.requested_bytes[statType].reset_peak();
    }
  }
-
-  double getMemoryFraction() {
-    if (!set_fraction) {
-      return 1.0;
-    }
-
-    c10::xpu::DeviceProp device_prop;
-    c10::xpu::get_device_properties(&device_prop, device_index);
-    return static_cast<double>(allowed_memory_maximum) /
-        static_cast<double>(device_prop.global_mem_size);
-  }
-
-  void setMemoryFraction(double fraction) {
-    c10::xpu::DeviceProp device_prop;
-    c10::xpu::get_device_properties(&device_prop, device_index);
-    auto device_total = device_prop.global_mem_size;
-    allowed_memory_maximum = static_cast<size_t>(fraction * device_total);
-    set_fraction = true;
-  }
 };

 static void local_raw_delete(void* ptr);
@ -734,21 +700,6 @@ class XPUAllocator : public DeviceAllocator {
    assertValidDevice(device);
    device_allocators[device]->resetAccumulatedStats();
  }
-
-  double getMemoryFraction(DeviceIndex device) {
-    assertValidDevice(device);
-    return device_allocators[device]->getMemoryFraction();
-  }
-
-  void setMemoryFraction(double fraction, DeviceIndex device) {
-    assertValidDevice(device);
-    TORCH_CHECK_VALUE(
-        0 < fraction && fraction <= 1,
-        "invalid fraction:",
-        fraction,
-        ". Please set within (0, 1].");
-    device_allocators[device]->setMemoryFraction(fraction);
-  }
 };

 static XPUAllocator allocator;
@ -793,14 +744,6 @@ void recordStream(const DataPtr& dataPtr, XPUStream stream) {
  return allocator.recordStream(dataPtr, stream);
 }

-double getMemoryFraction(DeviceIndex device) {
-  return allocator.getMemoryFraction(device);
-}
-
-void setMemoryFraction(double fraction, DeviceIndex device) {
-  return allocator.setMemoryFraction(fraction, device);
-}
-
 REGISTER_ALLOCATOR(kXPU, &allocator)

 } // namespace c10::xpu::XPUCachingAllocator
--- a/c10/xpu/XPUCachingAllocator.h
+++ b/c10/xpu/XPUCachingAllocator.h
@ -25,8 +25,4 @@ C10_XPU_API void raw_delete(void* ptr);

 C10_XPU_API void recordStream(const DataPtr& dataPtr, XPUStream stream);

-C10_XPU_API double getMemoryFraction(DeviceIndex device);
-
-C10_XPU_API void setMemoryFraction(double fraction, DeviceIndex device);
-
 } // namespace c10::xpu::XPUCachingAllocator
--- a/caffe2/CMakeLists.txt
+++ b/caffe2/CMakeLists.txt
@ -1358,15 +1358,9 @@ if(BUILD_TEST)
    )
  else()
    add_subdirectory(${TORCH_ROOT}/test/cpp/jit ${CMAKE_BINARY_DIR}/test_jit)
-    add_subdirectory(${TORCH_ROOT}/test/cpp/lazy ${CMAKE_BINARY_DIR}/test_lazy)
    # NativeRT is disabled
    # add_subdirectory(${TORCH_ROOT}/test/cpp/nativert ${CMAKE_BINARY_DIR}/test_nativert)
    add_subdirectory(${TORCH_ROOT}/test/inductor ${CMAKE_BINARY_DIR}/test_inductor)
-    add_subdirectory(${TORCH_ROOT}/test/cpp/aoti_abi_check ${CMAKE_BINARY_DIR}/test_aoti_abi_check)
-    if(BUILD_AOT_INDUCTOR_TEST)
-      add_subdirectory(${TORCH_ROOT}/test/cpp/aoti_inference ${CMAKE_BINARY_DIR}/test_aoti_inference)
-    endif()
-
    if(USE_DISTRIBUTED)
      add_subdirectory(${TORCH_ROOT}/test/cpp/c10d ${CMAKE_BINARY_DIR}/test_cpp_c10d)
      if(NOT WIN32)
@ -1384,6 +1378,16 @@ if(BUILD_TEST)
        ${CMAKE_BINARY_DIR}/test_mobile_nnc
      )
    endif()
+    add_subdirectory(${TORCH_ROOT}/test/cpp/lazy
+                     ${CMAKE_BINARY_DIR}/test_lazy)
+  endif()
+  if(BUILD_AOT_INDUCTOR_TEST)
+    add_subdirectory(
+      ${TORCH_ROOT}/test/cpp/aoti_abi_check
+      ${CMAKE_BINARY_DIR}/test_aoti_abi_check)
+    add_subdirectory(
+      ${TORCH_ROOT}/test/cpp/aoti_inference
+      ${CMAKE_BINARY_DIR}/test_aoti_inference)
  endif()
 endif()

--- a/caffe2/serialize/crc_alt.h
+++ b/caffe2/serialize/crc_alt.h
@ -38,7 +38,7 @@ uint32_t crc32_combine (uint32_t crcA, uint32_t crcB, size_t lengthB);

 /// compute CRC32 (bitwise algorithm)
 uint32_t crc32_bitwise (const void* data, size_t length, uint32_t previousCrc32 = 0);
-/// compute CRC32 (half-byte algorithm)
+/// compute CRC32 (half-byte algorithm)
 uint32_t crc32_halfbyte(const void* data, size_t length, uint32_t previousCrc32 = 0);

 #ifdef CRC32_USE_LOOKUP_TABLE_BYTE
@ -96,7 +96,7 @@ uint32_t crc32_16bytes_prefetch(const void* data, size_t length, uint32_t previo
  #define __BIG_ENDIAN    4321
 #endif

-// define endianness and some integer data types
+// define endianness and some integer data types
 #if defined(_MSC_VER) || defined(__MINGW32__)
  // Windows always little endian
  #define __BYTE_ORDER __LITTLE_ENDIAN
@ -168,7 +168,7 @@ namespace
  /// zlib's CRC32 polynomial
  const uint32_t Polynomial = 0xEDB88320;

-  /// swap endianness
+  /// swap endianness
  static inline uint32_t swap(uint32_t x)
  {
  #if defined(__GNUC__) || defined(__clang__)
@ -229,7 +229,7 @@ uint32_t crc32_bitwise(const void* data, size_t length, uint32_t previousCrc32)
 }


-/// compute CRC32 (half-byte algorithm)
+/// compute CRC32 (half-byte algorithm)
 uint32_t crc32_halfbyte(const void* data, size_t length, uint32_t previousCrc32)
 {
  uint32_t crc = ~previousCrc32; // same as previousCrc32 ^ 0xFFFFFFFF
@ -662,7 +662,7 @@ uint32_t crc32_combine(uint32_t crcA, uint32_t crcB, size_t lengthB)
  // - if you append length(B) zeros to A and call it A' (think of it as AAAA000)
  //   and   prepend length(A) zeros to B and call it B' (think of it as 0000BBB)
  //   then exists a C' = A' ^ B'
-  // - remember: if you XOR something with zero, it remains unchanged: X ^ 0 = X
+  // - remember: if you XOR something with zero, it remains unchanged: X ^ 0 = X
  // - that means C' = A concat B so that crc(A concat B) = crc(C') = crc(A') ^ crc(B')
  // - the trick is to compute crc(A') based on crc(A)
  //                       and crc(B') based on crc(B)
--- a/caffe2/serialize/inline_container.h
+++ b/caffe2/serialize/inline_container.h
@ -76,7 +76,7 @@ typedef struct mz_zip_archive mz_zip_archive;
 // 2) Writing with 1-pass sequential access
 //      -> We must take care not to require updating values that have already
 //         been written. We place the variable-length index at the end and do
-//         not put any index into the header to fulfill this constraint.
+//         not put any indices into the header to fulfill this constraint.

 // The model.json, which contains all the metadata information,
 // should be written as the last file. One reason is that the size of tensor
--- a/caffe2/serialize/inline_container_test.cc
+++ b/caffe2/serialize/inline_container_test.cc
@ -519,7 +519,7 @@ TEST(PyTorchStreamWriterAndReader, SaveAndLoadWithAllocator) {
  std::tie(data_ptr, size) = reader.getRecord("key1", &overrideAllocator);
  EXPECT_EQ(overrideAllocator.getAllocatedBytes(), kBytes1);
  EXPECT_EQ(baseAllocator.getAllocatedBytes(), allocBytes);
-  // allocate with base allocator
+  // allocate with base allocator
  std::tie(data_ptr, size) = reader.getRecord("key1");
  EXPECT_EQ(overrideAllocator.getAllocatedBytes(), kBytes1);
  EXPECT_EQ(baseAllocator.getAllocatedBytes(), allocBytes + kBytes1);
--- a/cmake/public/utils.cmake
+++ b/cmake/public/utils.cmake
@ -383,7 +383,7 @@ function(torch_compile_options libname)
      -Wno-strict-aliasing
      )
    if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
-      list(APPEND private_compile_options -Wredundant-move -Wno-interference-size)
+      list(APPEND private_compile_options -Wredundant-move)
    endif()
    if(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
      list(APPEND private_compile_options -Wextra-semi -Wmove)
--- a/docs/cpp/source/index.rst
+++ b/docs/cpp/source/index.rst
@ -14,7 +14,7 @@ Combining, these building blocks form a research and
 production ready C++ library for tensor computation and dynamic neural
 networks with strong emphasis on GPU acceleration as well as fast CPU
 performance. It is currently in use at Facebook in research and
-production; we are looking forward to welcoming more users of the PyTorch C++ API.
+production; we are looking forward to welcoming more users of the PyTorch C++ API.

 .. warning::

--- a/Show More
+++ b/Show More