Update (base update)

[ghstack-poisoned]
2025-11-12 23:14:32 +08:00 · 2025-11-06 12:55:56 -08:00
617 changed files with 6471 additions and 15747 deletions
--- a/.ci/docker/almalinux/build.sh
+++ b/.ci/docker/almalinux/build.sh
@ -36,7 +36,11 @@ case ${DOCKER_TAG_PREFIX} in
    ;;
  rocm*)
    BASE_TARGET=rocm
-    PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201;gfx950;gfx1150;gfx1151"
+    PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
+    # add gfx950, gfx115x conditionally starting in ROCm 7.0
+    if [[ "$ROCM_VERSION" == *"7.0"* ]]; then
+        PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950;gfx1150;gfx1151"
+    fi
    EXTRA_BUILD_ARGS="${EXTRA_BUILD_ARGS} --build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}"
    ;;
  *)
--- a/.ci/docker/build.sh
+++ b/.ci/docker/build.sh
@ -168,18 +168,6 @@ case "$tag" in
    VISION=yes
    TRITON=yes
    ;;
-  pytorch-linux-jammy-py3.11-clang12)
-    ANACONDA_PYTHON_VERSION=3.11
-    CLANG_VERSION=12
-    VISION=no
-    TRITON=no
-    ;;
-  pytorch-linux-jammy-py3.12-clang12)
-    ANACONDA_PYTHON_VERSION=3.12
-    CLANG_VERSION=12
-    VISION=no
-    TRITON=no
-    ;;
  pytorch-linux-jammy-rocm-n-py3 | pytorch-linux-jammy-rocm-n-py3-benchmarks | pytorch-linux-noble-rocm-n-py3)
    if [[ $tag =~ "jammy" ]]; then
      ANACONDA_PYTHON_VERSION=3.10
@ -207,9 +195,9 @@ case "$tag" in
    NINJA_VERSION=1.9.0
    TRITON=yes
    ;;
-  pytorch-linux-noble-xpu-n-py3 | pytorch-linux-noble-xpu-n-py3-inductor-benchmarks)
+  pytorch-linux-jammy-xpu-n-py3 | pytorch-linux-jammy-xpu-n-py3-inductor-benchmarks)
    ANACONDA_PYTHON_VERSION=3.10
-    GCC_VERSION=13
+    GCC_VERSION=11
    VISION=yes
    XPU_VERSION=2025.2
    NINJA_VERSION=1.9.0
@ -260,12 +248,6 @@ case "$tag" in
    HALIDE=yes
    TRITON=yes
    ;;
-  pytorch-linux-jammy-cuda12.8-py3.12-pallas)
-    CUDA_VERSION=12.8.1
-    ANACONDA_PYTHON_VERSION=3.12
-    GCC_VERSION=11
-    PALLAS=yes
-    ;;
  pytorch-linux-jammy-py3.12-triton-cpu)
    CUDA_VERSION=12.6
    ANACONDA_PYTHON_VERSION=3.12
@ -387,7 +369,6 @@ docker build \
       --build-arg "INDUCTOR_BENCHMARKS=${INDUCTOR_BENCHMARKS}" \
       --build-arg "EXECUTORCH=${EXECUTORCH}" \
       --build-arg "HALIDE=${HALIDE}" \
-       --build-arg "PALLAS=${PALLAS}" \
       --build-arg "XPU_VERSION=${XPU_VERSION}" \
       --build-arg "UNINSTALL_DILL=${UNINSTALL_DILL}" \
       --build-arg "ACL=${ACL:-}" \
--- a/.ci/docker/ci_commit_pins/jax.txt
+++ b/.ci/docker/ci_commit_pins/jax.txt
@ -1 +0,0 @@
-0.8.0
--- a/.ci/docker/common/install_jax.sh
+++ b/.ci/docker/common/install_jax.sh
@ -1,40 +0,0 @@
-#!/bin/bash
-
-set -ex
-
-source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh"
-
-# Get the pinned JAX version (same for all CUDA versions)
-JAX_VERSION=$(get_pinned_commit /ci_commit_pins/jax)
-
-function install_jax_12() {
-  echo "Installing JAX ${JAX_VERSION} with CUDA 12 support"
-  pip_install "jax[cuda12]==${JAX_VERSION}" -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html
-
-  # Verify installation
-  python -c "import jax"  # check for errors
-  echo "JAX ${JAX_VERSION} installation completed successfully for CUDA 12"
-}
-
-function install_jax_13() {
-  echo "Installing JAX ${JAX_VERSION} with CUDA 13 support"
-  pip_install "jax[cuda13]==${JAX_VERSION}" -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html
-
-  # Verify installation
-  python -c "import jax"  # check for errors
-  echo "JAX ${JAX_VERSION} installation completed successfully for CUDA 13"
-}
-
-# idiomatic parameter and option handling in sh
-while test $# -gt 0
-do
-    case "$1" in
-    12.4|12.6|12.6.*|12.8|12.8.*|12.9|12.9.*) install_jax_12;
-        ;;
-    13.0|13.0.*) install_jax_13;
-        ;;
-    *) echo "bad argument $1"; exit 1
-        ;;
-    esac
-    shift
-done
--- a/.ci/docker/common/install_libgomp.sh
+++ b/.ci/docker/common/install_libgomp.sh
@ -1,56 +0,0 @@
-#!/bin/bash
-# Script used only in CD pipeline
-
-set -ex
-
-# install dependencies
-dnf -y install gmp-devel libmpc-devel texinfo flex bison
-
-cd /usr/local/src
-# fetch source for gcc 13
-git clone --depth 1 --single-branch -b releases/gcc-13.3.0 https://github.com/gcc-mirror/gcc.git gcc-13.3.0
-
-mkdir -p gcc-13.3.0/build-gomp
-cd gcc-13.3.0/build-gomp
-
-# configure gcc build
-# I got these flags by:
-# 1. downloading the source rpm for gcc-11 on AlmaLinux 8 container
-#    dnf install -y dnf-plugins-core rpmdevtools
-#   dnf download --source libgomp
-# 2. extracting the gcc.spec from the source.
-#    rpmdev-extract gcc-xx.src.rpm
-# 3. extracting optflags and ld_flags from gcc.spec:
-#    rpm --eval '%{optflags}'
-#    rpm --eval '%{build_ldflags}'
-#
-# I had to remove the following flags because they didn't compile for this version of libgomp:
-#   -Werror=format-security
-#   -specs=/usr/lib/rpm/redhat/redhat-hardened-cc1
-#   -specs=/usr/lib/rpm/redhat/redhat-annobin-cc1
-#
-# I added -march=armv8-a -mtune=generic to make them explicit. I don't think they're strictly needed.
-
-OPT_FLAGS='-O2 -march=armv8-a -mtune=generic'\
-' -fexceptions -g -grecord-gcc-switches -pipe -Wall'\
-' -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS'\
-' -fstack-protector-strong -fasynchronous-unwind-tables'\
-' -fstack-clash-protection'
-
-LDFLAGS='-Wl,-z,relro -Wl,--as-needed -Wl,-z,now'
-
-CFLAGS="$OPT_FLAGS" \
-CXXFLAGS="$OPT_FLAGS" \
-LDFLAGS="$LDFLAGS" \
-../configure \
-  --prefix=/usr \
-  --libdir=/usr/lib64 \
-  --enable-languages=c,c++ \
-  --disable-multilib \
-  --disable-bootstrap \
-  --enable-libgomp
-
-# only build libgomp
-make -j$(nproc) all-target-libgomp
-
-make install-target-libgomp
--- a/.ci/docker/common/install_xpu.sh
+++ b/.ci/docker/common/install_xpu.sh
@ -9,7 +9,7 @@ set -xe

 function install_ubuntu() {
    . /etc/os-release
-    if [[ ! " jammy noble " =~ " ${VERSION_CODENAME} " ]]; then
+    if [[ ! " jammy " =~ " ${VERSION_CODENAME} " ]]; then
        echo "Ubuntu version ${VERSION_CODENAME} not supported"
        exit
    fi
@ -35,24 +35,25 @@ function install_ubuntu() {
    # The xpu-smi packages
    apt-get install -y flex bison xpu-smi

-    # Compute and Media Runtimes
-    if [[ " ${VERSION_CODENAME} " =~ " noble " ]]; then
+    if [[ "${XPU_DRIVER_TYPE,,}" == "lts" ]]; then
+        # Compute and Media Runtimes
        apt-get install -y \
-            intel-opencl-icd libze-intel-gpu1 libze1 \
-            intel-media-va-driver-non-free libmfx-gen1 libvpl2 \
-            libegl-mesa0 libegl1-mesa-dev libgbm1 libgl1-mesa-dev libgl1-mesa-dri \
+            intel-opencl-icd intel-level-zero-gpu level-zero \
+            intel-media-va-driver-non-free libmfx1 libmfxgen1 libvpl2 \
+            libegl-mesa0 libegl1-mesa libegl1-mesa-dev libgbm1 libgl1-mesa-dev libgl1-mesa-dri \
            libglapi-mesa libgles2-mesa-dev libglx-mesa0 libigdgmm12 libxatracker2 mesa-va-drivers \
-            mesa-vdpau-drivers mesa-vulkan-drivers va-driver-all vainfo hwinfo clinfo intel-ocloc
-    else # jammy
+            mesa-vdpau-drivers mesa-vulkan-drivers va-driver-all vainfo hwinfo clinfo
+        # Development Packages
+        apt-get install -y libigc-dev intel-igc-cm libigdfcl-dev libigfxcmrt-dev level-zero-dev
+    else # rolling driver
        apt-get install -y \
            intel-opencl-icd libze-intel-gpu1 libze1 \
            intel-media-va-driver-non-free libmfx-gen1 libvpl2 \
            libegl-mesa0 libegl1-mesa libegl1-mesa-dev libgbm1 libgl1-mesa-dev libgl1-mesa-dri \
            libglapi-mesa libglx-mesa0 libigdgmm12 libxatracker2 mesa-va-drivers \
            mesa-vdpau-drivers mesa-vulkan-drivers va-driver-all vainfo hwinfo clinfo intel-ocloc
+        apt-get install -y libigc-dev intel-igc-cm libigdfcl-dev libigfxcmrt-dev libze-dev
    fi
-    # Development Packages
-    apt-get install -y libigc-dev intel-igc-cm libigdfcl-dev libigfxcmrt-dev libze-dev

    # Install Intel Support Packages
    apt-get install -y ${XPU_PACKAGES}
@ -65,7 +66,7 @@ function install_ubuntu() {
 function install_rhel() {
    . /etc/os-release
    if [[ "${ID}" == "rhel" ]]; then
-        if [[ ! " 8.8 8.10 9.0 9.2 9.3 " =~ " ${VERSION_ID} " ]]; then
+        if [[ ! " 8.8 8.9 9.0 9.2 9.3 " =~ " ${VERSION_ID} " ]]; then
            echo "RHEL version ${VERSION_ID} not supported"
            exit
        fi
@ -146,7 +147,7 @@ function install_sles() {
 XPU_DRIVER_VERSION=""
 if [[ "${XPU_DRIVER_TYPE,,}" == "lts" ]]; then
    # Use GPU driver LTS releases
-    XPU_DRIVER_VERSION="/lts/2523"
+    XPU_DRIVER_VERSION="/lts/2350"
 fi

 # Default use Intel® oneAPI Deep Learning Essentials 2025.1
--- a/.ci/docker/libtorch/build.sh
+++ b/.ci/docker/libtorch/build.sh
@ -49,7 +49,11 @@ case ${DOCKER_TAG_PREFIX} in
        fi
        BASE_TARGET=rocm
        GPU_IMAGE=rocm/dev-ubuntu-22.04:${GPU_ARCH_VERSION}-complete
-        PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201;gfx950;gfx1150;gfx1151"
+        PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
+        # add gfx950, gfx115x conditionally starting in ROCm 7.0
+        if [[ "$GPU_ARCH_VERSION" == *"7.0"* ]]; then
+            PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950;gfx1150;gfx1151"
+        fi
        DOCKER_GPU_BUILD_ARG="--build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg ROCM_VERSION=${GPU_ARCH_VERSION}"
        ;;
    *)
--- a/.ci/docker/manywheel/Dockerfile_2_28_aarch64
+++ b/.ci/docker/manywheel/Dockerfile_2_28_aarch64
@ -50,10 +50,6 @@ RUN rm install_ninja.sh
 ENV PATH=/opt/rh/gcc-toolset-${GCCTOOLSET_VERSION}/root/usr/bin:$PATH
 ENV LD_LIBRARY_PATH=/opt/rh/gcc-toolset-${GCCTOOLSET_VERSION}/root/usr/lib64:/opt/rh/gcc-toolset-${GCCTOOLSET_VERSION}/root/usr/lib:$LD_LIBRARY_PATH

-# Build a newer version of libgomp than that supported in in Almalinux 8.
-COPY ./common/install_libgomp.sh install_libgomp.sh
-RUN bash ./install_libgomp.sh && rm install_libgomp.sh
-
 # git236+ would refuse to run git commands in repos owned by other users
 # Which causes version check to fail, as pytorch repo is bind-mounted into the image
 # Override this behaviour by treating every folder as safe
--- a/.ci/docker/manywheel/build.sh
+++ b/.ci/docker/manywheel/build.sh
@ -87,7 +87,11 @@ case ${image} in
        MANY_LINUX_VERSION="2_28"
        DEVTOOLSET_VERSION="11"
        GPU_IMAGE=rocm/dev-almalinux-8:${GPU_ARCH_VERSION}-complete
-        PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201;gfx950;gfx1150;gfx1151"
+        PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
+        # add gfx950, gfx115x conditionally starting in ROCm 7.0
+        if [[ "$GPU_ARCH_VERSION" == *"7.0"* ]]; then
+            PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950;gfx1150;gfx1151"
+        fi
        DOCKER_GPU_BUILD_ARG="--build-arg ROCM_VERSION=${GPU_ARCH_VERSION} --build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg DEVTOOLSET_VERSION=${DEVTOOLSET_VERSION}"
        ;;
    manylinux2_28-builder:xpu)
--- a/.ci/docker/requirements-docs.txt
+++ b/.ci/docker/requirements-docs.txt
@ -1,11 +1,15 @@
-sphinx==7.2.6
+sphinx==5.3.0
 #Description: This is used to generate PyTorch docs
-#Pinned versions: 7.2.6
+#Pinned versions: 5.3.0

-pytorch_sphinx_theme2==0.2.0
-#Description: This is needed to generate PyTorch docs
-#Pinned versions: 0.2.0
+standard-imghdr==3.13.0; python_version >= "3.13"
+#Description: This is needed by Sphinx, so it needs to be added here.
+# The reasons are as follows:
+# 1) This module has been removed from the Python standard library since Python 3.13(https://peps.python.org/pep-0594/#imghdr);
+# 2) The current version of Sphinx (5.3.0) is not compatible with Python 3.13.
+# Once Sphinx is upgraded to a version compatible with Python 3.13 or later, we can remove this dependency.

+-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@71e55749be14ceb56e7f8211a9fb649866b87ad4#egg=pytorch_sphinx_theme2
 # TODO: sphinxcontrib.katex 0.9.0 adds a local KaTeX server to speed up pre-rendering
 # but it doesn't seem to work and hangs around idly. The initial thought that it is probably
 # something related to Docker setup. We can investigate this later.
@ -32,17 +36,17 @@ tensorboard==2.18.0 ; python_version >= "3.13"
 #Description: This is used to generate PyTorch docs
 #Pinned versions: 2.13.0

-breathe==4.36.0
+breathe==4.34.0
 #Description: This is used to generate PyTorch C++ docs
-#Pinned versions: 4.36.0
+#Pinned versions: 4.34.0

-exhale==0.3.7
+exhale==0.2.3
 #Description: This is used to generate PyTorch C++ docs
-#Pinned versions: 0.3.7
+#Pinned versions: 0.2.3

-docutils==0.20
+docutils==0.16
 #Description: This is used to generate PyTorch C++ docs
-#Pinned versions: 0.20
+#Pinned versions: 0.16

 bs4==0.0.1
 #Description: This is used to generate PyTorch C++ docs
@ -52,13 +56,13 @@ IPython==8.12.0
 #Description: This is used to generate PyTorch functorch docs
 #Pinned versions: 8.12.0

-myst-nb==1.3.0
+myst-nb==0.17.2
 #Description: This is used to generate PyTorch functorch and torch.compile docs.
-#Pinned versions: 1.3.0
+#Pinned versions: 0.17.2

 # The following are required to build torch.distributed.elastic.rendezvous.etcd* docs
 python-etcd==0.4.5
 sphinx-copybutton==0.5.0
-sphinx-design==0.6.1
+sphinx-design==0.4.0
 sphinxcontrib-mermaid==1.0.0
-myst-parser==4.0.1
+myst-parser==0.18.1
--- a/.ci/docker/ubuntu/Dockerfile
+++ b/.ci/docker/ubuntu/Dockerfile
@ -143,15 +143,6 @@ COPY ci_commit_pins/halide.txt halide.txt
 RUN if [ -n "${HALIDE}" ]; then bash ./install_halide.sh; fi
 RUN rm install_halide.sh common_utils.sh halide.txt

-ARG PALLAS
-ARG CUDA_VERSION
-# Install JAX with CUDA support (for Pallas)
-COPY ./common/install_jax.sh install_jax.sh
-COPY ./common/common_utils.sh common_utils.sh
-COPY ./ci_commit_pins/jax.txt /ci_commit_pins/jax.txt
-RUN if [ -n "${PALLAS}" ]; then bash ./install_jax.sh ${CUDA_VERSION}; fi
-RUN rm -f install_jax.sh common_utils.sh /ci_commit_pins/jax.txt
-
 ARG ONNX
 # Install ONNX dependencies
 COPY ./common/install_onnx.sh ./common/common_utils.sh ./
--- a/.ci/lumen_cli/cli/lib/common/cli_helper.py
+++ b/.ci/lumen_cli/cli/lib/common/cli_helper.py
@ -8,11 +8,9 @@ from abc import ABC, abstractmethod


 try:
-    from collections.abc import Callable  # Python 3.11+
-    from typing import Any, Required, TypedDict
+    from typing import Any, Callable, Required, TypedDict  # Python 3.11+
 except ImportError:
-    from collections.abc import Callable
-    from typing import Any, TypedDict
+    from typing import Any, Callable, TypedDict

    from typing_extensions import Required  # Fallback for Python <3.11

--- a/.ci/pytorch/build.sh
+++ b/.ci/pytorch/build.sh
@ -168,16 +168,14 @@ if [[ "$BUILD_ENVIRONMENT" == *xpu* ]]; then
  # shellcheck disable=SC1091
  source /opt/intel/oneapi/compiler/latest/env/vars.sh
  # shellcheck disable=SC1091
-  source /opt/intel/oneapi/umf/latest/env/vars.sh
-  # shellcheck disable=SC1091
  source /opt/intel/oneapi/ccl/latest/env/vars.sh
  # shellcheck disable=SC1091
  source /opt/intel/oneapi/mpi/latest/env/vars.sh
-  # shellcheck disable=SC1091
-  source /opt/intel/oneapi/pti/latest/env/vars.sh
  # Enable XCCL build
  export USE_XCCL=1
  export USE_MPI=0
+  # XPU kineto feature dependencies are not fully ready, disable kineto build as temp WA
+  export USE_KINETO=0
  export TORCH_XPU_ARCH_LIST=pvc
 fi

--- a/.ci/pytorch/python_doc_push_script.sh
+++ b/.ci/pytorch/python_doc_push_script.sh
@ -89,41 +89,23 @@ if [ "$is_main_doc" = true ]; then

  make coverage
  # Now we have the coverage report, we need to make sure it is empty.
-  # Sphinx 7.2.6+ format: python.txt contains a statistics table with a TOTAL row
-  # showing the undocumented count in the third column.
-  # Example: | TOTAL | 99.83% | 2 |
+  # Count the number of lines in the file and turn that number into a variable
+  # $lines. The `cut -f1 ...` is to only parse the number, not the filename
+  # Skip the report header by subtracting 2: the header will be output even if
+  # there are no undocumented items.
  #
  # Also: see docs/source/conf.py for "coverage_ignore*" items, which should
  # be documented then removed from there.
-
-  # Extract undocumented count from TOTAL row in Sphinx 7.2.6 statistics table
-  # The table format is: | Module | Coverage | Undocumented |
-  # Extract the third column (undocumented count) from the TOTAL row
-  undocumented=$(grep "| TOTAL" build/coverage/python.txt | awk -F'|' '{print $4}' | tr -d ' ')
-
-  if [ -z "$undocumented" ] || ! [[ "$undocumented" =~ ^[0-9]+$ ]]; then
+  lines=$(wc -l build/coverage/python.txt 2>/dev/null |cut -f1 -d' ')
+  undocumented=$((lines - 2))
+  if [ $undocumented -lt 0 ]; then
    echo coverage output not found
    exit 1
-  elif [ "$undocumented" -gt 0 ]; then
-    set +x  # Disable command echoing for cleaner output
-    echo ""
-    echo "====================="
-    echo "UNDOCUMENTED OBJECTS:"
-    echo "====================="
-    echo ""
-    # Find the line number of the TOTAL row and print only what comes after it
-    total_line=$(grep -n "| TOTAL" build/coverage/python.txt | cut -d: -f1)
-    if [ -n "$total_line" ]; then
-      # Print only the detailed list (skip the statistics table)
-      tail -n +$((total_line + 2)) build/coverage/python.txt
-    else
-      # Fallback to showing entire file if TOTAL line not found
-      cat build/coverage/python.txt
-    fi
-    echo ""
+  elif [ $undocumented -gt 0 ]; then
+    echo undocumented objects found:
+    cat build/coverage/python.txt
    echo "Make sure you've updated relevant .rsts in docs/source!"
-    echo "You can reproduce locally by running 'cd docs && make coverage && tail -n +\$((grep -n \"| TOTAL\" build/coverage/python.txt | cut -d: -f1) + 2)) build/coverage/python.txt'"
-    set -x  # Re-enable command echoing
+    echo "You can reproduce locally by running 'cd docs && make coverage && cat build/coverage/python.txt'"
    exit 1
  fi
 else
--- a/.ci/pytorch/test.sh
+++ b/.ci/pytorch/test.sh
@ -208,8 +208,6 @@ if [[ "$BUILD_ENVIRONMENT" == *xpu* ]]; then
  source /opt/intel/oneapi/ccl/latest/env/vars.sh
  # shellcheck disable=SC1091
  source /opt/intel/oneapi/mpi/latest/env/vars.sh
-  # shellcheck disable=SC1091
-  source /opt/intel/oneapi/pti/latest/env/vars.sh
  # Check XPU status before testing
  timeout 30 xpu-smi discovery || true
 fi
@ -339,7 +337,7 @@ test_python() {

 test_python_smoke() {
  # Smoke tests for H100/B200
-  time python test/run_test.py --include test_matmul_cuda test_scaled_matmul_cuda inductor/test_fp8 inductor/test_max_autotune inductor/test_cutedsl_grouped_mm $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
+  time python test/run_test.py --include test_matmul_cuda test_scaled_matmul_cuda inductor/test_fp8 inductor/test_max_autotune $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
  assert_git_not_dirty
 }

@ -826,11 +824,6 @@ test_inductor_halide() {
  assert_git_not_dirty
 }

-test_inductor_pallas() {
-  python test/run_test.py --include inductor/test_pallas.py --verbose
-  assert_git_not_dirty
-}
-
 test_inductor_triton_cpu() {
  python test/run_test.py --include inductor/test_triton_cpu_backend.py inductor/test_torchinductor_strided_blocks.py --verbose
  assert_git_not_dirty
@ -1731,8 +1724,6 @@ elif [[ "${TEST_CONFIG}" == *inductor_distributed* ]]; then
  test_inductor_distributed
 elif [[ "${TEST_CONFIG}" == *inductor-halide* ]]; then
  test_inductor_halide
-elif [[ "${TEST_CONFIG}" == *inductor-pallas* ]]; then
-  test_inductor_pallas
 elif [[ "${TEST_CONFIG}" == *inductor-triton-cpu* ]]; then
  test_inductor_triton_cpu
 elif [[ "${TEST_CONFIG}" == *inductor-micro-benchmark* ]]; then
--- a/.ci/pytorch/win-test-helpers/arm64/build_pytorch.ps1
+++ b/.ci/pytorch/win-test-helpers/arm64/build_pytorch.ps1
@ -70,7 +70,7 @@ sccache --zero-stats
 sccache --show-stats

 # Build the wheel
-python -m build --wheel --no-isolation
+python -m build --wheel --no-build-isolation
 if ($LASTEXITCODE -ne 0) { exit 1 }

 # Install the wheel locally
--- a/.github/ci_commit_pins/vision.txt
+++ b/.github/ci_commit_pins/vision.txt
@ -1 +1 @@
-ccb801b88af136454798b945175c4c87e636ac33
+cfbc5c2f1c798991715a6b06bb3ce46478c4487c
--- a/.github/ci_commit_pins/xla.txt
+++ b/.github/ci_commit_pins/xla.txt
@ -1 +1 @@
-e4d25697f9dc5eedaf8f0a5bf085c62c5455a53a
+c8b09f5f77d6bf6fb7ed7a9aa83e5d8156b3a5e9
--- a/.github/labeler.yml
+++ b/.github/labeler.yml
@ -138,8 +138,7 @@
 - test/test_matmul_cuda.py
 - test/test_scaled_matmul_cuda.py
 - test/inductor/test_fp8.py
- aten/src/ATen/native/cuda/*Blas.cpp
- aten/src/ATen/cuda/CUDA*Blas.*
+- aten/src/ATen/native/cuda/Blas.cpp
 - torch/**/*cublas*
 - torch/_inductor/kernel/mm.py
 - test/inductor/test_max_autotune.py
@ -149,8 +148,7 @@
 - test/test_matmul_cuda.py
 - test/test_scaled_matmul_cuda.py
 - test/inductor/test_fp8.py
- aten/src/ATen/native/cuda/*Blas.cpp
- aten/src/ATen/cuda/CUDA*Blas.*
+- aten/src/ATen/native/cuda/Blas.cpp
 - torch/**/*cublas*
 - torch/_inductor/kernel/mm.py
 - test/inductor/test_max_autotune.py
@ -160,8 +158,7 @@
 - test/test_matmul_cuda.py
 - test/test_scaled_matmul_cuda.py
 - test/inductor/test_fp8.py
- aten/src/ATen/native/cuda/*Blas.cpp
- aten/src/ATen/cuda/CUDA*Blas.*
+- aten/src/ATen/native/cuda/Blas.cpp
 - torch/_inductor/kernel/mm.py
 - test/inductor/test_max_autotune.py
 - third_party/fbgemm
--- a/.github/nitpicks.yml
+++ b/.github/nitpicks.yml
@ -10,4 +10,3 @@
  pathFilter:
    - 'torch/csrc/inductor/aoti_torch/c/*'
    - 'torch/csrc/inductor/aoti_torch/generated/*'
-    - 'torch/csrc/stable/c/*'
--- a/.github/pytorch-probot.yml
+++ b/.github/pytorch-probot.yml
@ -2,8 +2,8 @@ tracking_issue: 24422
 ciflow_tracking_issue: 64124
 ciflow_push_tags:
 - ciflow/b200
- ciflow/b200-distributed
 - ciflow/b200-symm-mem
+- ciflow/b200-distributed
 - ciflow/binaries
 - ciflow/binaries_libtorch
 - ciflow/binaries_wheel
@ -22,8 +22,6 @@ ciflow_push_tags:
 - ciflow/inductor-perf-test-nightly-xpu
 - ciflow/inductor-periodic
 - ciflow/inductor-rocm
- ciflow/inductor-rocm-mi200
- ciflow/inductor-rocm-mi300
 - ciflow/linux-aarch64
 - ciflow/mps
 - ciflow/nightly
@ -35,13 +33,11 @@ ciflow_push_tags:
 - ciflow/quantization-periodic
 - ciflow/riscv64
 - ciflow/rocm
- ciflow/rocm-mi200
 - ciflow/rocm-mi300
 - ciflow/rocm-mi355
 - ciflow/rocm-navi31
 - ciflow/s390
 - ciflow/slow
- ciflow/slow-rocm-mi200
 - ciflow/torchbench
 - ciflow/triton_binaries
 - ciflow/trunk
--- a/.github/scripts/delete_old_branches.py
+++ b/.github/scripts/delete_old_branches.py
@ -1,11 +1,10 @@
 # Delete old branches
 import os
 import re
-from collections.abc import Callable
 from datetime import datetime
 from functools import lru_cache
 from pathlib import Path
-from typing import Any
+from typing import Any, Callable

 from github_utils import gh_fetch_json_dict, gh_graphql
 from gitutils import GitRepo
--- a/.github/scripts/filter_test_configs.py
+++ b/.github/scripts/filter_test_configs.py
@ -8,11 +8,10 @@ import re
 import subprocess
 import sys
 import warnings
-from collections.abc import Callable
 from enum import Enum
 from functools import cache
 from logging import info
-from typing import Any, Optional
+from typing import Any, Callable, Optional
 from urllib.request import Request, urlopen

 import yaml
--- a/.github/scripts/get_workflow_job_id.py
+++ b/.github/scripts/get_workflow_job_id.py
@ -11,8 +11,7 @@ import sys
 import time
 import urllib
 import urllib.parse
-from collections.abc import Callable
-from typing import Any, Optional
+from typing import Any, Callable, Optional
 from urllib.request import Request, urlopen


--- a/.github/scripts/github_utils.py
+++ b/.github/scripts/github_utils.py
@ -3,9 +3,8 @@
 import json
 import os
 import warnings
-from collections.abc import Callable
 from dataclasses import dataclass
-from typing import Any, cast, Optional, Union
+from typing import Any, Callable, cast, Optional, Union
 from urllib.error import HTTPError
 from urllib.parse import quote
 from urllib.request import Request, urlopen
--- a/.github/scripts/gitutils.py
+++ b/.github/scripts/gitutils.py
@ -4,10 +4,10 @@ import os
 import re
 import tempfile
 from collections import defaultdict
-from collections.abc import Callable, Iterator
+from collections.abc import Iterator
 from datetime import datetime
 from functools import wraps
-from typing import Any, cast, Optional, TypeVar, Union
+from typing import Any, Callable, cast, Optional, TypeVar, Union


 T = TypeVar("T")
--- a/.github/scripts/trymerge.py
+++ b/.github/scripts/trymerge.py
@ -17,12 +17,12 @@ import re
 import time
 import urllib.parse
 from collections import defaultdict
-from collections.abc import Callable, Iterable
+from collections.abc import Iterable
 from dataclasses import dataclass
 from functools import cache
 from pathlib import Path
 from re import Pattern
-from typing import Any, cast, NamedTuple, Optional
+from typing import Any, Callable, cast, NamedTuple, Optional
 from warnings import warn

 import yaml
--- a/.github/workflows/docker-builds.yml
+++ b/.github/workflows/docker-builds.yml
@ -56,8 +56,6 @@ jobs:
          pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9,
          pytorch-linux-jammy-cuda12.4-cudnn9-py3-gcc11,
          pytorch-linux-jammy-py3.10-clang12,
-          pytorch-linux-jammy-py3.11-clang12,
-          pytorch-linux-jammy-py3.12-clang12,
          pytorch-linux-jammy-py3.13-clang12,
          pytorch-linux-jammy-py3.14-clang12,
          pytorch-linux-jammy-rocm-n-py3,
@ -67,10 +65,9 @@ jobs:
          pytorch-linux-jammy-py3.10-gcc11,
          pytorch-linux-jammy-py3-gcc11-inductor-benchmarks,
          pytorch-linux-jammy-py3.12-halide,
-          pytorch-linux-jammy-cuda12.8-py3.12-pallas,
          pytorch-linux-jammy-xpu-n-1-py3,
-          pytorch-linux-noble-xpu-n-py3,
-          pytorch-linux-noble-xpu-n-py3-inductor-benchmarks,
+          pytorch-linux-jammy-xpu-n-py3,
+          pytorch-linux-jammy-xpu-n-py3-inductor-benchmarks,
          pytorch-linux-jammy-py3-clang18-asan,
          pytorch-linux-jammy-py3-clang12-onnx,
          pytorch-linux-jammy-linter,
--- a/.github/workflows/inductor-perf-test-nightly-xpu.yml
+++ b/.github/workflows/inductor-perf-test-nightly-xpu.yml
@ -83,8 +83,8 @@ jobs:
    needs: get-label-type
    with:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      build-environment: linux-noble-xpu-n-py3.10
-      docker-image-name: ci-image:pytorch-linux-noble-xpu-n-py3-inductor-benchmarks
+      build-environment: linux-jammy-xpu-n-py3.10
+      docker-image-name: ci-image:pytorch-linux-jammy-xpu-n-py3-inductor-benchmarks
      runner: linux.c7i.12xlarge
      test-matrix: |
        { include: [
@ -117,7 +117,7 @@ jobs:
    uses: ./.github/workflows/_xpu-test.yml
    needs: xpu-n-py3_10-inductor-benchmark-build
    with:
-      build-environment: linux-noble-xpu-n-py3.10
+      build-environment: linux-jammy-xpu-n-py3.10
      dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-false-cppwrapper-true-aotinductor-true-freezing_cudagraphs-false-cudagraphs_low_precision-false
      docker-image: ${{ needs.xpu-n-py3_10-inductor-benchmark-build.outputs.docker-image }}
      test-matrix: ${{ needs.xpu-n-py3_10-inductor-benchmark-build.outputs.test-matrix }}
@ -137,7 +137,7 @@ jobs:
    uses: ./.github/workflows/_xpu-test.yml
    needs: xpu-n-py3_10-inductor-benchmark-build
    with:
-      build-environment: linux-noble-xpu-n-py3.10
+      build-environment: linux-jammy-xpu-n-py3.10
      dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cudagraphs-${{ inputs.cudagraphs }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}-maxautotune-${{ inputs.maxautotune }}-freezing_cudagraphs-${{ inputs.freezing_cudagraphs }}-cudagraphs_low_precision-${{ inputs.cudagraphs }}
      docker-image: ${{ needs.xpu-n-py3_10-inductor-benchmark-build.outputs.docker-image }}
      test-matrix: ${{ needs.xpu-n-py3_10-inductor-benchmark-build.outputs.test-matrix }}
--- a/.github/workflows/inductor-rocm-mi300.yml
+++ b/.github/workflows/inductor-rocm-mi300.yml
@ -7,7 +7,6 @@ on:
      - release/*
    tags:
      - ciflow/inductor-rocm/*
-      - ciflow/inductor-rocm-mi300/*
  workflow_dispatch:

 concurrency:
--- a/.github/workflows/inductor-rocm-mi200.yml
+++ b/.github/workflows/inductor-rocm-mi200.yml
@ -2,12 +2,12 @@ name: inductor-rocm

 on:
  schedule:
-    - cron: 0 */3 * * *
+    - cron: 0 * * * *
  push:
    branches:
      - release/*
    tags:
-      - ciflow/inductor-rocm-mi200/*
+      - ciflow/inductor-rocm/*
  workflow_dispatch:

 concurrency:
--- a/.github/workflows/inductor-unittest.yml
+++ b/.github/workflows/inductor-unittest.yml
@ -81,32 +81,6 @@ jobs:
      test-matrix: ${{ needs.inductor-halide-build.outputs.test-matrix }}
    secrets: inherit

-  inductor-pallas-build:
-    name: inductor-pallas-build
-    uses: ./.github/workflows/_linux-build.yml
-    needs: get-label-type
-    with:
-      build-environment: linux-jammy-cuda12.8-py3.12-gcc11
-      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-py3.12-pallas
-      cuda-arch-list: '8.9'
-      runner: linux.8xlarge.memory
-      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      test-matrix: |
-        { include: [
-          { config: "inductor-pallas", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.12xlarge.nvidia.gpu" },
-        ]}
-    secrets: inherit
-
-  inductor-pallas-test:
-    name: inductor-pallas-test
-    uses: ./.github/workflows/_linux-test.yml
-    needs: inductor-pallas-build
-    with:
-      build-environment: linux-jammy-py3.12-gcc11
-      docker-image: ${{ needs.inductor-pallas-build.outputs.docker-image }}
-      test-matrix: ${{ needs.inductor-pallas-build.outputs.test-matrix }}
-    secrets: inherit
-
  inductor-triton-cpu-build:
    name: inductor-triton-cpu-build
    uses: ./.github/workflows/_linux-build.yml
--- a/.github/workflows/periodic-rocm-mi200.yml
+++ b/.github/workflows/periodic-rocm-mi200.yml
@ -11,6 +11,7 @@ on:
    - cron: 29 8 * * *  # about 1:29am PDT, for mem leak check and rerun disabled tests
  push:
    tags:
+      - ciflow/periodic/*
      - ciflow/periodic-rocm-mi200/*
    branches:
      - release/*
--- a/.github/workflows/periodic-rocm-mi300.yml
+++ b/.github/workflows/periodic-rocm-mi300.yml
@ -11,7 +11,6 @@ on:
    - cron: 29 8 * * *  # about 1:29am PDT, for mem leak check and rerun disabled tests
  push:
    tags:
-      - ciflow/periodic/*
      - ciflow/periodic-rocm-mi300/*
    branches:
      - release/*
--- a/.github/workflows/pull.yml
+++ b/.github/workflows/pull.yml
@ -342,16 +342,16 @@ jobs:
      test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.test-matrix }}
    secrets: inherit

-  linux-noble-xpu-n-py3_10-build:
-    name: linux-noble-xpu-n-py3.10
+  linux-jammy-xpu-n-py3_10-build:
+    name: linux-jammy-xpu-n-py3.10
    uses: ./.github/workflows/_linux-build.yml
    needs: get-label-type
    with:
      # This should sync with the build in xpu.yml but xpu uses a larger runner
      # sync-tag: linux-xpu-n-build
      runner_prefix: ${{ needs.get-label-type.outputs.label-type }}
-      build-environment: linux-noble-xpu-n-py3.10
-      docker-image-name: ci-image:pytorch-linux-noble-xpu-n-py3
+      build-environment: linux-jammy-xpu-n-py3.10
+      docker-image-name: ci-image:pytorch-linux-jammy-xpu-n-py3
      test-matrix: |
        { include: [
          { config: "default", shard: 1, num_shards: 4, runner: "linux.idc.xpu" },
--- a/.github/workflows/rocm-mi300.yml
+++ b/.github/workflows/rocm-mi300.yml
@ -6,7 +6,6 @@ on:
      - main
      - release/*
    tags:
-      - ciflow/rocm/*
      - ciflow/rocm-mi300/*
  workflow_dispatch:
  schedule:
--- a/.github/workflows/rocm-mi200.yml
+++ b/.github/workflows/rocm-mi200.yml
@ -5,12 +5,11 @@ on:
    branches:
      - release/*
    tags:
-      - ciflow/rocm-mi200/*
+      - ciflow/rocm/*
  workflow_dispatch:
  schedule:
    - cron: 29 8 * * *  # about 1:29am PDT
-    - cron: 0 */3 * * *
-
+    - cron: 0 * * * *

 concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
--- a/.github/workflows/slow-rocm-mi200.yml
+++ b/.github/workflows/slow-rocm-mi200.yml
@ -1,81 +0,0 @@
-# This workflow is dedicated to host slow jobs that are run only periodically because
-# they are too slow to run in every commit.  The list of slow tests can be found in
-# https://github.com/pytorch/test-infra/blob/generated-stats/stats/slow-tests.json
-name: slow-rocm-mi200
-
-on:
-  push:
-    branches:
-      - release/*
-    tags:
-      - ciflow/slow/*
-      - ciflow/slow-rocm-mi200/*
-  schedule:
-    - cron: 0 */3 * * *
-  workflow_dispatch:
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}-${{ github.event.schedule }}
-  cancel-in-progress: true
-
-permissions:
-  id-token: write
-  contents: read
-
-jobs:
-  llm-td:
-    if: github.repository_owner == 'pytorch'
-    name: before-test
-    uses: ./.github/workflows/llm_td_retrieval.yml
-    permissions:
-      id-token: write
-      contents: read
-
-  target-determination:
-    name: before-test
-    uses: ./.github/workflows/target_determination.yml
-    needs: llm-td
-    permissions:
-      id-token: write
-      contents: read
-
-  get-label-type:
-    name: get-label-type
-    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
-    if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
-    with:
-      triggering_actor: ${{ github.triggering_actor }}
-      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
-      curr_branch: ${{ github.head_ref || github.ref_name }}
-      curr_ref_type: ${{ github.ref_type }}
-
-  linux-jammy-rocm-py3_10-build:
-    name: linux-jammy-rocm-py3.10
-    uses: ./.github/workflows/_linux-build.yml
-    needs: get-label-type
-    with:
-      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      build-environment: linux-jammy-rocm-py3.10
-      docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
-      sync-tag: rocm-build
-      test-matrix: |
-        { include: [
-          { config: "slow", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.2", owners: ["module:rocm"] },
-          { config: "slow", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.2", owners: ["module:rocm"] },
-        ]}
-    secrets: inherit
-
-  linux-jammy-rocm-py3_10-test:
-    permissions:
-      id-token: write
-      contents: read
-    name: linux-jammy-rocm-py3.10
-    uses: ./.github/workflows/_rocm-test.yml
-    needs:
-      - linux-jammy-rocm-py3_10-build
-      - target-determination
-    with:
-      build-environment: linux-jammy-rocm-py3.10
-      docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }}
-    secrets: inherit
--- a/.github/workflows/slow.yml
+++ b/.github/workflows/slow.yml
@ -105,6 +105,36 @@ jobs:
      test-matrix: ${{ needs.linux-jammy-py3_10-clang12-build.outputs.test-matrix }}
    secrets: inherit

+  linux-jammy-rocm-py3_10-build:
+    name: linux-jammy-rocm-py3.10
+    uses: ./.github/workflows/_linux-build.yml
+    needs: get-label-type
+    with:
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      build-environment: linux-jammy-rocm-py3.10
+      docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
+      test-matrix: |
+        { include: [
+          { config: "slow", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.2", owners: ["module:rocm"] },
+          { config: "slow", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.2", owners: ["module:rocm"] },
+        ]}
+    secrets: inherit
+
+  linux-jammy-rocm-py3_10-test:
+    permissions:
+      id-token: write
+      contents: read
+    name: linux-jammy-rocm-py3.10
+    uses: ./.github/workflows/_rocm-test.yml
+    needs:
+      - linux-jammy-rocm-py3_10-build
+      - target-determination
+    with:
+      build-environment: linux-jammy-rocm-py3.10
+      docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }}
+    secrets: inherit
+
  linux-jammy-py3_10-clang18-asan-build:
    name: linux-jammy-py3.10-clang18-asan
    uses: ./.github/workflows/_linux-build.yml
--- a/.github/workflows/upload-test-stats.yml
+++ b/.github/workflows/upload-test-stats.yml
@ -11,16 +11,15 @@ on:
      - inductor
      - unstable
      - slow
-      - slow-rocm-mi200
      - unstable-periodic
      - inductor-periodic
-      - rocm-mi200
+      - rocm
      - rocm-mi300
      - rocm-mi355
      - inductor-micro-benchmark
      - inductor-micro-benchmark-x86
      - inductor-cu124
-      - inductor-rocm-mi200
+      - inductor-rocm
      - inductor-rocm-mi300
      - mac-mps
      - linux-aarch64
--- a/.github/workflows/xpu.yml
+++ b/.github/workflows/xpu.yml
@ -47,15 +47,15 @@ jobs:
        ]}
    secrets: inherit

-  linux-noble-xpu-n-py3_10-build:
-    name: linux-noble-xpu-n-py3.10
+  linux-jammy-xpu-n-py3_10-build:
+    name: linux-jammy-xpu-n-py3.10
    uses: ./.github/workflows/_linux-build.yml
    needs: get-label-type
    with:
      sync-tag: linux-xpu-n-build
      runner_prefix: ${{ needs.get-label-type.outputs.label-type }}
-      build-environment: linux-noble-xpu-n-py3.10
-      docker-image-name: ci-image:pytorch-linux-noble-xpu-n-py3
+      build-environment: linux-jammy-xpu-n-py3.10
+      docker-image-name: ci-image:pytorch-linux-jammy-xpu-n-py3
      runner: linux.c7i.12xlarge
      test-matrix: |
        { include: [
@ -74,17 +74,17 @@ jobs:
        ]}
    secrets: inherit

-  linux-noble-xpu-n-py3_10-test:
-    name: linux-noble-xpu-n-py3.10
+  linux-jammy-xpu-n-py3_10-test:
+    name: linux-jammy-xpu-n-py3.10
    uses: ./.github/workflows/_xpu-test.yml
-    needs: linux-noble-xpu-n-py3_10-build
+    needs: linux-jammy-xpu-n-py3_10-build
    permissions:
      id-token: write
      contents: read
    with:
-      build-environment: linux-noble-xpu-n-py3.10
-      docker-image: ${{ needs.linux-noble-xpu-n-py3_10-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-noble-xpu-n-py3_10-build.outputs.test-matrix }}
+      build-environment: linux-jammy-xpu-n-py3.10
+      docker-image: ${{ needs.linux-jammy-xpu-n-py3_10-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-jammy-xpu-n-py3_10-build.outputs.test-matrix }}
    secrets: inherit

  windows-xpu-n-1-build:
--- a/.gitignore
+++ b/.gitignore
@ -127,7 +127,6 @@ torch/test/
 torch/utils/benchmark/utils/valgrind_wrapper/callgrind.h
 torch/utils/benchmark/utils/valgrind_wrapper/valgrind.h
 torch/version.py
-torch/_inductor/kernel/vendored_templates/*
 minifier_launcher.py
 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_fwd_d*
 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_bwd_d*
--- a/.lintrunner.toml
+++ b/.lintrunner.toml
@ -143,8 +143,7 @@ init_command = [
    'tools/linter/adapters/pip_init.py',
    '--dry-run={{DRYRUN}}',
    'numpy==1.26.4 ; python_version >= "3.10" and python_version <= "3.11"',
-    'numpy==2.1.0 ; python_version >= "3.12" and python_version <= "3.13"',
-    'numpy==2.3.4 ; python_version >= "3.14"',
+    'numpy==2.1.0 ; python_version >= "3.12"',
    'expecttest==0.3.0',
    'pyrefly==0.36.2',
    'sympy==1.13.3',
@ -1402,7 +1401,7 @@ init_command = [
    '--dry-run={{DRYRUN}}',
    'usort==1.0.8.post1',
    'isort==6.0.1',
-    'ruff==0.14.4',  # sync with RUFF
+    'ruff==0.13.1',  # sync with RUFF
 ]
 is_formatter = true

@ -1537,7 +1536,7 @@ init_command = [
    'python3',
    'tools/linter/adapters/pip_init.py',
    '--dry-run={{DRYRUN}}',
-    'ruff==0.14.4',  # sync with PYFMT
+    'ruff==0.13.1',  # sync with PYFMT
 ]
 is_formatter = true

--- a/CODEOWNERS
+++ b/CODEOWNERS
@ -210,12 +210,8 @@ torch/backends/cudnn/ @eqy @syed-ahmed @Aidyn-A
 /test/inductor/test_flex_attention.py @drisspg
 /test/inductor/test_flex_decoding.py @drisspg

-# Low Precision & Grouped GEMMs
+# Low Precision GEMMs
 /aten/src/ATen/native/cuda/Blas.cpp @drisspg @slayton58
-/aten/src/ATen/native/cuda/GroupedBlas.cpp @drisspg @slayton58
-/aten/src/ATen/native/cuda/ScaledBlas.cpp @drisspg @slayton58
 /aten/src/ATen/cuda/CUDABlas.cpp @drisspg @slayton58
 /aten/src/ATen/cuda/CUDABlas.h @drisspg @slayton58
-/aten/src/ATen/cuda/CUDAScaledBlas.cpp @drisspg @slayton58
-/aten/src/ATen/cuda/CUDAScaledBlas.h @drisspg @slayton58
 /test/test_scaled_matmul_cuda.py @drisspg @slayton58
--- a/aten/src/ATen/Context.h
+++ b/aten/src/ATen/Context.h
@ -174,12 +174,6 @@ class TORCH_API Context {
  static long versionCuDNN() {
    return detail::getCUDAHooks().versionCuDNN();
  }
-  static long versionRuntimeCuDNN() {
-    return detail::getCUDAHooks().versionRuntimeCuDNN();
-  }
-  static long versionCuDNNFrontend() {
-    return detail::getCUDAHooks().versionCuDNNFrontend();
-  }
  static bool hasCuSOLVER() {
    return detail::getCUDAHooks().hasCuSOLVER();
  }
--- a/aten/src/ATen/DeviceAccelerator.h
+++ b/aten/src/ATen/DeviceAccelerator.h
@ -94,11 +94,6 @@ TORCH_API inline void resetPeakStats(c10::DeviceIndex device_index) {
  at::getDeviceAllocator(device_type)->resetPeakStats(device_index);
 }

-TORCH_API inline std::pair<size_t, size_t> getMemoryInfo(
-    c10::DeviceIndex device_index) {
-  const auto device_type = getAccelerator(true).value();
-  return at::getDeviceAllocator(device_type)->getMemoryInfo(device_index);
-}
 } // namespace at::accelerator

 namespace at {
--- a/aten/src/ATen/Dispatch.h
+++ b/aten/src/ATen/Dispatch.h
@ -6,7 +6,6 @@
 #include <c10/util/Half.h>
 #include <c10/util/Metaprogramming.h>
 #include <c10/util/complex.h>
-#include <torch/headeronly/core/Dispatch.h>

 #ifdef __CUDACC__
 #include <cuda.h> // For CUDA_VERSION
@ -62,9 +61,12 @@ TORCH_API void record_kernel_function_dtype(std::string name);
    }                                                 \
  } while (0)

-#define AT_PRIVATE_CASE_TYPE_USING_HINT(enum_type, HINT, ...) \
-  THO_PRIVATE_CASE_TYPE_USING_HINT_TMPL(                      \
-      AT_PRIVATE_CHECK_SELECTIVE_BUILD, enum_type, HINT, __VA_ARGS__)
+#define AT_PRIVATE_CASE_TYPE_USING_HINT(enum_type, HINT, ...)                 \
+  case enum_type: {                                                           \
+    AT_PRIVATE_CHECK_SELECTIVE_BUILD(enum_type);                              \
+    using HINT [[maybe_unused]] = c10::impl::ScalarTypeToCPPTypeT<enum_type>; \
+    return __VA_ARGS__();                                                     \
+  }

 #define AT_DISPATCH_CASE(enum_type, ...) \
  AT_PRIVATE_CASE_TYPE_USING_HINT(enum_type, scalar_t, __VA_ARGS__)
@ -93,6 +95,14 @@ TORCH_API void record_kernel_function_dtype(std::string name);
    return __VA_ARGS__();                                                   \
  }

+namespace detail {
+
+inline at::ScalarType scalar_type(at::ScalarType s) {
+  return s;
+}
+
+} // namespace detail
+
 // The AT_DISPATCH_* family of macros provides the ability to
 // conveniently generate specializations of a kernel over all of the
 // dtypes we care about in PyTorch.  We call it "dispatch" because
@ -180,13 +190,27 @@ TORCH_API void record_kernel_function_dtype(std::string name);
 // but we're just being safe (and it doesn't hurt.)  Note we must
 // use it to shut up warnings about unused store.

-#define AT_DISPATCH_SWITCH(TYPE, NAME, ...) \
-  THO_DISPATCH_SWITCH_TMPL(                 \
-      RECORD_KERNEL_FUNCTION_DTYPE,         \
-      TORCH_CHECK_NOT_IMPLEMENTED,          \
-      TYPE,                                 \
-      NAME,                                 \
-      __VA_ARGS__)
+#define AT_DISPATCH_SWITCH(TYPE, NAME, ...)                                 \
+  [&] {                                                                     \
+    const auto& the_type = TYPE;                                            \
+    constexpr const char* at_dispatch_name = NAME;                          \
+    /* don't use TYPE again in case it is an expensive or side-effect op */ \
+    at::ScalarType _st = ::detail::scalar_type(the_type);                   \
+    RECORD_KERNEL_FUNCTION_DTYPE(at_dispatch_name, _st);                    \
+    C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wswitch-enum")             \
+    switch (_st) {                                                          \
+      __VA_ARGS__                                                           \
+      default:                                                              \
+        TORCH_CHECK_NOT_IMPLEMENTED(                                        \
+            false,                                                          \
+            '"',                                                            \
+            at_dispatch_name,                                               \
+            "\" not implemented for '",                                     \
+            toString(_st),                                                  \
+            "'");                                                           \
+    }                                                                       \
+    C10_DIAGNOSTIC_POP()                                                    \
+  }()

 #define AT_DISPATCH_CASE_FLOATING_TYPES(...)            \
  AT_DISPATCH_CASE(at::ScalarType::Double, __VA_ARGS__) \
--- a/aten/src/ATen/Dispatch_v2.h
+++ b/aten/src/ATen/Dispatch_v2.h
@ -1,8 +1,3 @@
-#pragma once
-
-#include <torch/headeronly/core/Dispatch_v2.h>
-
-// Get AT_DISPATCH_SWITCH and AT_DISPATCH_CASE:
 #include <ATen/Dispatch.h>

 // This is a new implementation of the AT_DISPATCH macro family from
@ -79,19 +74,41 @@
 // macro expansion occurs, mediated with AT_EXPAND and AT_GUARD.  I mostly
 // relied on GPT4 to help me get it right.

+// Public API macros
+
 // See documentation above
 #define AT_DISPATCH_V2(TYPE, NAME, BODY, ...) \
-  THO_DISPATCH_V2_TMPL(                       \
-      AT_DISPATCH_SWITCH,                     \
-      AT_DISPATCH_CASE,                       \
-      TYPE,                                   \
-      NAME,                                   \
-      AT_WRAP(BODY),                          \
-      __VA_ARGS__)
+  AT_DISPATCH_SWITCH(TYPE, NAME, AT_AP_VAR(AT_WRAP(BODY), TYPE, __VA_ARGS__))
+
+// This macro lets you pass an arbitrary expression that may contain internal
+// commas to another macro without the commas causing the expression to be
+// interpreted as multiple arguments
+#define AT_WRAP(...) __VA_ARGS__
+
+#define AT_FLOAT8_TYPES                                          \
+  c10::kFloat8_e5m2, c10::kFloat8_e5m2fnuz, c10::kFloat8_e4m3fn, \
+      c10::kFloat8_e4m3fnuz, c10::kFloat8_e8m0fnu
+
+#define AT_INTEGRAL_TYPES \
+  c10::kByte, c10::kChar, c10::kInt, c10::kLong, c10::kShort
+#define AT_FLOATING_TYPES c10::kDouble, c10::kFloat
+#define AT_BAREBONES_UNSIGNED_TYPES c10::kUInt16, c10::kUInt32, c10::kUInt64
+#define AT_INTEGRAL_TYPES_V2 \
+  AT_EXPAND(AT_INTEGRAL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES)
+#define AT_COMPLEX_TYPES c10::kComplexDouble, c10::kComplexFloat
+#define AT_QINT_TYPES c10::kQInt8, c10::kQUInt8, c10::kQInt32
+// NB: not *actually* all types
+#define AT_ALL_TYPES AT_EXPAND(AT_INTEGRAL_TYPES), AT_EXPAND(AT_FLOATING_TYPES)
+#define AT_ALL_TYPES_AND_COMPLEX \
+  AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_COMPLEX_TYPES)
+
+// Helper macros

-// Unused helper macros, kept for BC:
 #define AT_AP_VAR(N, T, ...) \
  AT_EXPAND(AT_CONCAT(AT_AP, AT_NUM_ARGS(__VA_ARGS__))(AT_WRAP(N), __VA_ARGS__))
+#define AT_CONCAT(a, b) AT_CONCAT_AUX(a, b)
+#define AT_CONCAT_AUX(a, b) a##b
+#define AT_EXPAND(X) X

 // Ensure we never have too many scalar types for the expansion here to
 // support.  To bump this, you must regenerate the macros below.
@ -102,6 +119,12 @@ static_assert(static_cast<int>(c10::ScalarType::NumOptions) < 60);

 num_args = 60

+nums = ', '.join(str(i) for i in reversed(range(num_args+1)))
+args = ', '.join(f'_{i}' for i in range(1, num_args+1))
+
+print(f'#define AT_NUM_ARGS(...) AT_EXPAND(AT_NUM_ARGS_AUX(__VA_ARGS__, {nums}))')
+print(f'#define AT_NUM_ARGS_AUX({args}, N, ...) N')
+
 for i in range(1, num_args+1):
    args = ', '.join(f'_{i}' for i in range(1, i+1))
    cases = ' '.join([f'AT_DISPATCH_CASE(_{j}, N)' for j in range(1, i+1)])
@ -112,6 +135,8 @@ for i in range(1, num_args+1):
 // Begin generated code
 // clang-format off

+#define AT_NUM_ARGS(...) AT_EXPAND(AT_NUM_ARGS_AUX(__VA_ARGS__, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0))
+#define AT_NUM_ARGS_AUX(_1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34, _35, _36, _37, _38, _39, _40, _41, _42, _43, _44, _45, _46, _47, _48, _49, _50, _51, _52, _53, _54, _55, _56, _57, _58, _59, _60, N, ...) N
 #define AT_AP1(N, _1) AT_DISPATCH_CASE(_1, N)
 #define AT_AP2(N, _1, _2) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N)
 #define AT_AP3(N, _1, _2, _3) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N)
--- a/aten/src/ATen/core/CachingHostAllocator.h
+++ b/aten/src/ATen/core/CachingHostAllocator.h
@ -226,8 +226,8 @@ template <
    typename B = HostBlock<S>>
 struct CachingHostAllocatorImpl {
  virtual ~CachingHostAllocatorImpl() {
-    if (active_) {
-      active_ = false;
+    active_ = false;
+    if (pinned_use_background_threads()) {
      getBackgroundThreadPool()->waitWorkComplete();
    }
  }
@ -260,7 +260,6 @@ struct CachingHostAllocatorImpl {
    if (pinned_use_background_threads()) {
      // Launch the background thread and process events in a loop.
      static bool background_thread_flag [[maybe_unused]] = [this] {
-        active_ = true;
        getBackgroundThreadPool()->run([&]() {
          while (active_) {
            process_events();
@ -684,9 +683,9 @@ struct CachingHostAllocatorImpl {
  alignas(hardware_destructive_interference_size) std::mutex events_mutex_;
  std::deque<std::pair<E, B*>> events_; // event queue paired with block

-  // Indicates whether the event-processing thread pool is active.
+  // Indicates whether the object is active.
  // Set to false in the destructor to signal background threads to stop.
-  std::atomic<bool> active_{false};
+  std::atomic<bool> active_{true};
 protected:
  alignas(hardware_destructive_interference_size) HostStatsStaged stats_;
 };
--- a/aten/src/ATen/cuda/CUDABlas.cpp
+++ b/aten/src/ATen/cuda/CUDABlas.cpp
@ -1597,7 +1597,7 @@ bool gemm_and_bias(
  }

  using opmath_t = at::opmath_type<Dtype>;
-  opmath_t beta_val = bias ? 0 : 1; // bias is added in epilogue unless nullptr
+  opmath_t beta_val = 0; // bias is added in epilogue

  cudaDataType_t abType = CUDA_R_32F;
  cudaDataType_t cType = CUDA_R_32F;
@ -1686,22 +1686,15 @@ bool gemm_and_bias(
    _syncCurrentWithCarveoutStream(stream, true);
  }
 #endif
-  const auto epilogue = [&]() -> cublasLtEpilogue_t {
-    // The cuBLAS documentation indicates that
-    // *_<ACTIVATION>_BIAS = *_<ACTIVATION>,
-    // but we keep it verbose here for clarity.
-    switch (activation) {
-      case GEMMAndBiasActivationEpilogue::RELU:
-        return bias ? CUBLASLT_EPILOGUE_RELU_BIAS : CUBLASLT_EPILOGUE_RELU;
-      case GEMMAndBiasActivationEpilogue::GELU:
-        return bias ? CUBLASLT_EPILOGUE_GELU_BIAS : CUBLASLT_EPILOGUE_GELU;
-      default:
-        return bias ? CUBLASLT_EPILOGUE_BIAS : CUBLASLT_EPILOGUE_DEFAULT;
-    }
-  }();
-  computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_EPILOGUE, epilogue);
+  cublasLtEpilogue_t epilogue = CUBLASLT_EPILOGUE_BIAS;
+  if (activation == GEMMAndBiasActivationEpilogue::RELU) {
+    epilogue = CUBLASLT_EPILOGUE_RELU_BIAS;
+  } else if (activation == GEMMAndBiasActivationEpilogue::GELU) {
+    epilogue = CUBLASLT_EPILOGUE_GELU_BIAS;
+  }

-  if (bias) {
+  if (bias != nullptr) {
+    computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_EPILOGUE, epilogue);
    computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_BIAS_POINTER, bias);
  }

--- a/aten/src/ATen/cuda/detail/CUDAHooks.cpp
+++ b/aten/src/ATen/cuda/detail/CUDAHooks.cpp
@ -21,7 +21,6 @@

 #if AT_CUDNN_ENABLED()
 #include <ATen/cudnn/cudnn-wrapper.h>
-#include <cudnn_frontend.h>
 #endif

 #if AT_MAGMA_ENABLED()
@ -352,26 +351,6 @@ long CUDAHooks::versionCuDNN() const {
 #endif
 }

-long CUDAHooks::versionRuntimeCuDNN() const {
-#if AT_CUDNN_ENABLED()
-#ifndef USE_STATIC_CUDNN
-  return cudnnGetVersion();
-#else
-  return CUDNN_VERSION;
-#endif
-#else
-  TORCH_CHECK(false, "Cannot query CuDNN version if ATen_cuda is not built with CuDNN");
-#endif
-}
-
-long CUDAHooks::versionCuDNNFrontend() const {
-#if AT_CUDNN_ENABLED()
-  return CUDNN_FRONTEND_VERSION;
-#else
-  TORCH_CHECK(false, "Cannot query CuDNN Frontend version if ATen_cuda is not built with CuDNN");
-#endif
-}
-
 long CUDAHooks::versionMIOpen() const {
 #if AT_ROCM_ENABLED()
  return MIOPEN_VERSION_MAJOR * 10000 +
--- a/aten/src/ATen/cuda/detail/CUDAHooks.h
+++ b/aten/src/ATen/cuda/detail/CUDAHooks.h
@ -49,8 +49,6 @@ struct CUDAHooks : public at::CUDAHooksInterface {
  bool hasCUDART() const override;
  long versionCUDART() const override;
  long versionCuDNN() const override;
-  long versionRuntimeCuDNN() const override;
-  long versionCuDNNFrontend() const override;
  long versionMIOpen() const override;
  std::string showConfig() const override;
  double batchnormMinEpsilonCuDNN() const override;
--- a/aten/src/ATen/detail/CUDAHooksInterface.h
+++ b/aten/src/ATen/detail/CUDAHooksInterface.h
@ -174,14 +174,6 @@ struct TORCH_API CUDAHooksInterface : AcceleratorHooksInterface {
    TORCH_CHECK(false, "Cannot query cuDNN version without ATen_cuda library. ", CUDA_HELP);
  }

-  virtual long versionRuntimeCuDNN() const {
-    TORCH_CHECK(false, "Cannot query cuDNN version without ATen_cuda library. ", CUDA_HELP);
-  }
-
-  virtual long versionCuDNNFrontend() const {
-    TORCH_CHECK(false, "Cannot query cuDNN Frontend version without ATen_cuda library. ", CUDA_HELP);
-  }
-
  virtual long versionMIOpen() const {
    TORCH_CHECK(false, "Cannot query MIOpen version without ATen_cuda library. ", CUDA_HELP);
  }
--- a/aten/src/ATen/functorch/BatchedTensorImpl.h
+++ b/aten/src/ATen/functorch/BatchedTensorImpl.h
@ -157,8 +157,6 @@ constexpr DispatchKeySet kKeysToPropagateToWrapper({
  DispatchKey::Negative,
  DispatchKey::Conjugate,
  DispatchKey::XLA,
-  DispatchKey::XPU,
-  DispatchKey::HPU,
  DispatchKey::CUDA,
  DispatchKey::CPU,
  DispatchKey::PrivateUse1,
--- a/aten/src/ATen/native/Convolution.cpp
+++ b/aten/src/ATen/native/Convolution.cpp
@ -409,7 +409,7 @@ struct ConvParams {
    if (!detail::getCUDAHooks().compiledWithCuDNN() || !input.is_cuda() || !cudnn_enabled) {
      return false;
    }
-    static long cudnn_version = detail::getCUDAHooks().versionRuntimeCuDNN();
+    static long cudnn_version = detail::getCUDAHooks().versionCuDNN();
    // broken on cuDNN 9.8 - 9.14
    if (cudnn_version >= 90800 && cudnn_version < 91500) {
      if (cudnn_conv_suggest_memory_format(input, weight) == at::MemoryFormat::Contiguous &&
@ -453,7 +453,7 @@ struct ConvParams {
    }
    // native kernel doesn't support 64-bit non-splittable case
    if (!(canUse32BitIndexMath(input) && canUse32BitIndexMath(weight))) {
-      static long cudnn_version = detail::getCUDAHooks().compiledWithCuDNN() ? detail::getCUDAHooks().versionRuntimeCuDNN() : -1;
+      static long cudnn_version = detail::getCUDAHooks().compiledWithCuDNN() ? detail::getCUDAHooks().versionCuDNN() : -1;
      // TODO(eqy): remove this once cuDNN fixes 64-bit depthwise support, first broken in 9.11x
      if (cudnn_conv_suggest_memory_format(input, weight) != at::MemoryFormat::Contiguous) {
        if (cudnn_version < 0 || cudnn_version > 91000) {
--- a/aten/src/ATen/native/Linear.cpp
+++ b/aten/src/ATen/native/Linear.cpp
@ -50,35 +50,18 @@ static inline bool parseLinearFlatten3d() {
 // `_flatten_nd_linear` flattens all but the last dimension of the input tensor
 // before passing it to linear operation
 static inline Tensor _flatten_nd_linear(const Tensor& input, const Tensor& weight, const Tensor& bias) {
-  const auto input_sizes = input.sym_sizes();
-
-  const auto result_flattened = [&]() -> Tensor {
-    const auto input_ncols = input_sizes.back();
-    const auto input_flattened_nrows = [&]() -> c10::SymInt {
-      // can't use -1 in reshape because it errors when a dimension is 0
-      auto flattened_nrows = c10::SymInt{1};
-      for (const auto& size : input_sizes.slice(0, input_sizes.size() - 1)) {
-        flattened_nrows *= size;
-      }
-      return flattened_nrows;
-    }();
-
-    const auto input_flattened = input.view_symint({input_flattened_nrows, input_ncols});
-    if (weight.layout() == c10::kStrided) {
-      return at::addmm(bias, input_flattened, weight.t());
-    } else {
-      // weight is sparse, and addmm for sparse expects matmul lhs to be sparse,
-      // so we transpose the problem.
-      // NOTE: at::matmul handles (dense @ sparse) similarly.
-      const auto bias_t = (bias.dim() >= 2) ? bias.mT() : bias.unsqueeze(-1);
-      return at::addmm(bias_t, weight, input_flattened.t()).t();
+    const auto input_sizes = input.sym_sizes();
+    // can't use -1 in reshape because it errors when a dimension is 0
+    c10::SymInt flattened_dim = 1;
+    for (int64_t i = 0, ndim = input_sizes.size(); i < ndim - 1; ++i) {
+      flattened_dim = flattened_dim * input_sizes[i];
    }
-  }();
-
-  // Unflatten flattened row dims
-  auto result_sizes = c10::SymDimVector{input_sizes.begin(), input_sizes.end()};
-  result_sizes.back() = result_flattened.sym_size(1);
-  return result_flattened.view_symint(result_sizes);
+    auto inp_reshape = input.reshape_symint({flattened_dim, input_sizes.at(input_sizes.size() -1)});
+    const auto result = at::addmm(bias, inp_reshape, weight.t());
+    auto new_size = input_sizes.slice(0, input_sizes.size() - 1);
+    c10::SymDimVector sizes_vec(new_size.begin(), new_size.end());
+    sizes_vec.push_back(result.sym_size(1));
+    return result.view_symint(sizes_vec);
 }


@ -107,23 +90,15 @@ Tensor linear(const Tensor& input, const Tensor& weight, const std::optional<Ten
    // Fused op is marginally faster.
    return at::addmm(*bias, input, weight.t());
  }
-
-  const auto is_bias_likely_fusable = (
-      bias->defined() &&
-      // cuBLASLt: will fuse in the epilogue without copies
-      // when input/weight/bias are all strided.
-      // When weight is not strided, bias will not be fused,
-      // but we can still dispatch here to avoid at::matmul
-      // path which will probably use a very similar
-      // flattening optimization.
-      ((bias->dim() == 1 || bias->squeeze().dim() == 1) && bias->is_contiguous_or_false())
-  );
-  if (is_bias_likely_fusable && !input.is_xla()) {
-    // Also hit the fused path for contiguous nD input, if not using xla
+  if (bias->defined() && !input.is_xla()) {
+    // Also hit the fused path for contiguous 3D input, if not using xla
    // backend. Reshaping/flattening has some performance implications on xla.
-    if (input.is_contiguous_or_false()) {
+    bool is_contiguous = input.is_contiguous_or_false();
+    if (is_contiguous && input_dim == 3) {
      return _flatten_nd_linear(input, weight, *bias);
-    } else if (parseLinearFlatten3d()) {
+    } else if (is_contiguous && input.layout() == c10::kStrided && weight.layout() == c10::kStrided && bias->dim() == 1) {
+      return _flatten_nd_linear(input, weight, *bias);
+    } else if (parseLinearFlatten3d() && input_dim == 3) {
      // If user forces flattening via env var
      const Tensor input_cont = input.contiguous();
      return _flatten_nd_linear(input_cont, weight, *bias);
--- a/aten/src/ATen/native/cpu/Reduce.h
+++ b/aten/src/ATen/native/cpu/Reduce.h
@ -247,8 +247,8 @@ void binary_kernel_reduce(TensorIteratorBase& iter, ops_t ops, init_t init) {
  });
 }

-template <typename func_t, typename vec_func_t>
-void binary_kernel_reduce_vec(TensorIteratorBase& iter, func_t op, vec_func_t vop, double ident = 0) {
+template <typename func_t, typename vec_func_t, typename ident_t = double>
+void binary_kernel_reduce_vec(TensorIteratorBase& iter, func_t op, vec_func_t vop, ident_t ident = static_cast<ident_t>(0)) {
  using traits = binary_function_traits<func_t>;
  static_assert(
    all_same<
--- a/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp
+++ b/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp
@ -339,33 +339,13 @@ void or_kernel_impl(TensorIterator& iter) {
  }
 }

-template<typename scalar_t>
-struct MinValuesOps: public at::native::MinOps<scalar_t> {
-  using arg_t = typename MinOps<scalar_t>::arg_t;
-  static scalar_t project(arg_t arg) {
-    return arg.first;
-  }
-};
-
 void min_values_kernel_impl(TensorIterator& iter) {
-  // This case is special because of Vectorized<int64_t> does not
-  // handle upper_bound<int64_t>().
-  // See: https://github.com/pytorch/pytorch/issues/43254
-  if (iter.dtype() == kLong || iter.dtype() == kUInt64) {
-    AT_DISPATCH_V2(iter.dtype(), "min_values_cpu", AT_WRAP([&iter] {
-      binary_kernel_reduce(
-        iter,
-        MinValuesOps<scalar_t>{},
-        std::pair<scalar_t, int64_t>(upper_bound<scalar_t>(), -1));
-    }), kLong, kUInt64);
-    return;
-  }
  AT_DISPATCH_V2(iter.dtype(), "min_values_cpu", AT_WRAP([&iter] {
    binary_kernel_reduce_vec(
      iter,
      [](scalar_t a, scalar_t b) -> scalar_t { return min_impl(a, b); },
      [](Vectorized<scalar_t> a, Vectorized<scalar_t> b) { return minimum(a, b); },
-      static_cast<double>(upper_bound<scalar_t>()));
+      upper_bound<scalar_t>());
  }), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), kBFloat16, kHalf, kBool);
 }

--- a/aten/src/ATen/native/cuda/Blas.cpp
+++ b/aten/src/ATen/native/cuda/Blas.cpp
@ -147,24 +147,14 @@ static bool isGloballyDisabledAddmmCudaLt(const at::Device& device) {
 /*
 * Check whether for the given input we want to enable the Lt interface
 */
-static bool isInputCompliesAddmmCudaLt(
-    Tensor& result,
-    const Tensor& self,
-    const Tensor& mat1,
-    const Tensor& mat2,
-    const Scalar& beta,
-    const Scalar& alpha,
-    Activation activation
-) {
-  #ifdef USE_ROCM
+static bool isInputCompliesAddmmCudaLt(Tensor& result, const Tensor& self, const Tensor& mat1, const Tensor& mat2, const Scalar& beta, const Scalar& alpha) {
  // Implies 2D bias which we currently not send through Lt.
  // TODO: this check is done pre col-major input preparation,
  // so, this condition can be ralexed in cases when a col-major
  // copy of result is needed.
-  if (self.is_same(result) || self.dim() == 2) {
+  if (result.is_same(self)) {
    return false;
  }
-  #endif

  #if defined(USE_ROCM) && ROCM_VERSION == 60400
  // hipblaslt TT fp32 regression on ROCm 6.4, cannot use
@ -179,33 +169,13 @@ static bool isInputCompliesAddmmCudaLt(
  #if defined(CUDA_VERSION) || defined(USE_ROCM)
  const auto scalar_type = mat1.scalar_type();
  return (beta.toComplexDouble() == 1.0
-    // NOTE: row-major result is important when bias is 1D.
-    // This is because Lt broadcasts 1D bias over the columns
-    // while the aten::addmm API broadcasts it over the rows,
-    // and this is in conjuction with the data preparation
-    // procedure that does not transpose arguments with
-    // col-major result. For col-major result we need
-    // to explicitly transpose the problem so that bias is
-    // correctly applied.
-    // TODO: enable col-major result if needed.
-    // TODO: no need to check result's layout when
-    // !result.is_same(self) and self.dim() == 2, because
-    // self needs to be copied into result and the bias ptr
-    // will be ignored.
    && result.dim() == 2 && result.is_contiguous()
+    // Conditions for bias to be fusable
    && (
-      ( // Conditions for bias to be fusable -- implies direct Lt path without copies.
-        self.is_contiguous() &&
-        // NOTE: fine to have 1-len dims to the left from the right-most one
-        (self.dim() == 1 || self.squeeze().dim() == 1) &&
-        self.sizes().back() == mat2_sizes[1]
-      )
-      || ( // 2D bias restrictions. self.is_contiguous() is implicit when result.is_same(self),
-        // and we need to copy self into result otherwise, so the self's layout becomes irrelevant.
-        // See also TODO from above.
-        activation != Activation::None && // Lt is faster when activation is fused
-        (self.dim() == 2 && at::is_expandable_to(self.sizes(), {mat1_sizes[0], mat2_sizes[1]}))
-      )
+      self.is_contiguous() &&
+      // NOTE: fine to have 1-len dims to the left from the right-most one
+      (self.dim() == 1 || self.squeeze().dim() == 1) &&
+      self.sizes().back() == mat2_sizes[1]
    )
    && ( // some dtype restrictions
      #ifndef USE_ROCM
@ -300,16 +270,7 @@ bool launchGemmAndBiasCublasLt(
    const Scalar& alpha,
    Activation activation = Activation::None
 ) {
-  // We apply bias in the epilogue only when it is 1D,
-  // or when it can be squeezed to 1D.
-  // self_ptr == nullptr implies ignore bias epilogue
-  // and use standard gemm-like API.
-  const auto* self_ptr = [&]() -> auto {
-    if (self.dim() == 1 || self.squeeze().dim() == 1) {
-      return self.const_data_ptr<scalar_t>();
-    }
-    return static_cast<const scalar_t*>(nullptr);
-  }();
+  const auto* self_ptr = self.const_data_ptr<scalar_t>();

  const auto tuning_ctx = at::cuda::tunable::getTuningContext();
  if (tuning_ctx->IsTunableOpEnabled()) {
@ -395,7 +356,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
  disable_addmm_cuda_lt = isGloballyDisabledAddmmCudaLt(self.device()) || disable_addmm_cuda_lt;
  #endif
  // Condition on the input
-  disable_addmm_cuda_lt = !isInputCompliesAddmmCudaLt(result, self, mat1, mat2, beta, alpha, activation) || disable_addmm_cuda_lt;
+  disable_addmm_cuda_lt = !isInputCompliesAddmmCudaLt(result, self, mat1, mat2, beta, alpha) || disable_addmm_cuda_lt;
  // }

  at::ScalarType scalar_type = mat1.scalar_type();
@ -405,20 +366,19 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
  if (!result.is_same(self)) {
    at::native::resize_output(result, {mat1.sizes()[0], mat2.sizes()[1]});

-    // We use bias ptr in the Lt path only when bias is 1D
-    const auto use_bias_ptr_lt = (self.dim() == 1) && !disable_addmm_cuda_lt;
    const auto self_maybe_expanded = [&]() -> c10::MaybeOwned<Tensor> {
-      if (!use_bias_ptr_lt) {
-        // We do expand self even before
+      if (disable_addmm_cuda_lt) {
+        // When in non-Lt path we do expand self even before
        // check for beta != 0.0 to make sure that
        // test_sparse_csr.py::TestSparseCSRCUDA::test_addmm_errors_*
        // runs green.
        return expand_size(self, result.sizes(), "addmm");
      }
+      // copy next, should broadcast
      return c10::MaybeOwned<Tensor>::borrowed(self);
    }();
-    // We do not copy bias only when we need the bias ptr
-    if (beta.toComplexDouble() != 0.0 && !use_bias_ptr_lt) {
+    // We copy bias when in the non-Lt path
+    if (beta.toComplexDouble() != 0.0 && disable_addmm_cuda_lt) {
      // NOTE: self should broadcast over result
      at::native::copy_(result, *self_maybe_expanded);
    }
--- a/aten/src/ATen/native/cuda/CUDALoops.cuh
+++ b/aten/src/ATen/native/cuda/CUDALoops.cuh
@ -884,69 +884,6 @@ struct type_specialized_kernel_launcher {
  }
 };

-template <int arg_index>
-struct type_specialized_broadcast_kernel_launcher {
-  template <
-      typename func_t,
-      typename array_t,
-      typename dtypes_t,
-      typename calc_t>
-  static void apply(
-      int64_t numel,
-      func_t f,
-      array_t data,
-      dtypes_t dtypes,
-      calc_t offset_calc) {
-        using traits = function_traits<func_t>;
-        using ret_t = typename traits::result_type;
-        using arg0_t = typename traits::template arg<0>::type;
-        using arg1_t = typename traits::template arg<1>::type;
-        if (dtypes[0] == rt_binary_specializations[arg_index][0] &&
-          dtypes[1] == rt_binary_specializations[arg_index][1] &&
-          dtypes[2] == rt_binary_specializations[arg_index][2]) {
-            using ret_cpp_t = c10::impl::ScalarTypeToCPPTypeT<rt_binary_specializations[arg_index][0]>;
-            using arg0_cpp_t = c10::impl::ScalarTypeToCPPTypeT<rt_binary_specializations[arg_index][1]>;
-            using arg1_cpp_t = c10::impl::ScalarTypeToCPPTypeT<rt_binary_specializations[arg_index][2]>;
-            constexpr int grp_sz = 128;
-            launch_legacy_kernel_manual_unroll<grp_sz, 4>(numel, [=] GPU_LAMBDA(int idx, bool unrl) {
-              if (unrl) {
-                auto offsets0 = offset_calc.get(idx);
-                auto offsets1 = offset_calc.get(idx + grp_sz);
-                auto offsets2 = offset_calc.get(idx + grp_sz * 2);
-                auto offsets3 = offset_calc.get(idx + grp_sz * 3);
-                void* out0 = data[0] + offsets0[0];
-                void* out1 = data[0] + offsets1[0];
-                void* out2 = data[0] + offsets2[0];
-                void* out3 = data[0] + offsets3[0];
-                auto u = c10::load<arg0_cpp_t>(data[1] + offsets0[1]);
-                auto v = c10::load<arg1_cpp_t>(data[2] + offsets0[2]);
-                ret_t result0 = f(c10::convert<arg0_t>(u), c10::convert<arg1_t>(v));
-                auto u1 = c10::load<arg0_cpp_t>(data[1] + offsets1[1]);
-                auto v1 = c10::load<arg1_cpp_t>(data[2]+ offsets1[2]);
-                ret_t result1 = f(c10::convert<arg0_t>(u1), c10::convert<arg1_t>(v1));
-                auto u2 = c10::load<arg0_cpp_t>(data[1] + offsets2[1]);
-                auto v2 = c10::load<arg1_cpp_t>(data[2] + offsets2[2]);
-                ret_t result2 = f(c10::convert<arg0_t>(u2), c10::convert<arg1_t>(v2));
-                auto u3 = c10::load<arg0_cpp_t>(data[1] + offsets3[1]);
-                auto v3 = c10::load<arg1_cpp_t>(data[2] + offsets3[2]);
-                ret_t result3 = f(c10::convert<arg0_t>(u3), c10::convert<arg1_t>(v3));
-                *(ret_cpp_t*)out0 = c10::convert<ret_cpp_t>(result0);
-                *(ret_cpp_t*)out1 = c10::convert<ret_cpp_t>(result1);
-                *(ret_cpp_t*)out2 = c10::convert<ret_cpp_t>(result2);
-                *(ret_cpp_t*)out3 = c10::convert<ret_cpp_t>(result3);
-              } else {
-                auto offsets = offset_calc.get(idx);
-                void* out = data[0] + offsets[0];
-                auto u = c10::load<arg0_cpp_t>(data[1] + offsets[1]);
-                auto v = c10::load<arg1_cpp_t>(data[2] + offsets[2]);
-                ret_t result = f(c10::convert<arg0_t>(u), c10::convert<arg1_t>(v));
-                *(ret_cpp_t*)out = c10::convert<ret_cpp_t>(result);
-              }
-            });
-        }
-      }
-};
-
 } // namespace
 #endif

@ -1065,32 +1002,6 @@ void gpu_kernel_impl(TensorIteratorBase& iter, const func_t& f) {
    }
    auto offset_calc = ::make_offset_calculator<traits::arity + 1>(iter);
 #ifdef USE_ROCM
-    if (check_binary_rt_types_for_specialization(iter)) {
-      // constexpr to reduce the amount of kernels generated for
-      // broadcast elementwise with mexed dtypes and limit which functors are actually
-      // applied to the load and store at compile time.
-      using func_tuple = typename traits::ArgsTuple;
-      if constexpr (
-        std::is_same_v<float, arg0_t> && traits::arity == 2 &&
-        check_binary_functor_types_for_specialization<
-          func_tuple,
-          float,
-          float,
-          traits::arity,
-          /*arg_num=*/0>::check()) {
-            memory::detail::static_unroll<
-              type_specialized_broadcast_kernel_launcher,
-              rt_binary_specializations.size()>::with_args(
-                numel,
-                f,
-                data,
-                dtypes,
-                offset_calc
-            );
-            return;
-      }
-    }
-
    constexpr int grp_sz = 128;
    launch_legacy_kernel_manual_unroll<grp_sz, 4>(numel, [=] GPU_LAMBDA(int idx, bool unrl) {
      if (unrl) {
--- a/aten/src/ATen/native/mkldnn/xpu/detail/QConv.cpp
+++ b/aten/src/ATen/native/mkldnn/xpu/detail/QConv.cpp
@ -133,7 +133,7 @@ at::Tensor quantized_convolution(
  // supported in conv.
  mask_weight = weight_zero_points.numel() > 1 ? 1 : 0;
  if (groups > 1 && weight_zero_points.numel() > 1)
-    mask_weight = (1 << 0) | (1 << 1); // 2^0 (group) | 2^1 (output channel)
+    mask_weight = (1 << 0) | (1 << 1); // bit 0 (group) | bit 1 (output channel)
  dnnl::primitive_attr pattr;

  bool src_need_zp = (act_zero_point != 0);
--- a/aten/src/ATen/native/mps/operations/Blas.mm
+++ b/aten/src/ATen/native/mps/operations/Blas.mm
@ -141,9 +141,6 @@ static Tensor& addmv_out_mps_impl(const Tensor& self,
  };

  MPSStream* stream = at::mps::getCurrentMPSStream();
-  if (result.numel() == 0) {
-    return result;
-  }
  Tensor matMulVec = at::mm(mat, vec.unsqueeze(1)).squeeze(1);

  @autoreleasepool {
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@ -2803,7 +2803,7 @@
 - func: floor_divide.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
  device_check: NoCheck   # TensorIterator
  dispatch:
-    CPU, CUDA, MPS, MTIA: floor_divide_out
+    CPU, CUDA, MPS: floor_divide_out
    SparseCPU, SparseCUDA, SparseMPS: floor_divide_out_sparse_zerodim

 - func: floor_divide.Scalar(Tensor self, Scalar other) -> Tensor
@ -4292,7 +4292,6 @@
  dispatch:
    SparseCPU: sparse_sparse_matmul_cpu
    SparseCUDA: sparse_sparse_matmul_cuda
-    SparseMPS: sparse_sparse_matmul_mps
  autogen: _sparse_sparse_matmul.out

 - func: mode(Tensor self, int dim=-1, bool keepdim=False) -> (Tensor values, Tensor indices)
@ -4384,7 +4383,7 @@
  variants: function, method
  dispatch:
    CompositeExplicitAutograd: mv
-    SparseCPU, SparseCUDA, SparseMPS: mv_sparse
+    SparseCPU, SparseCUDA: mv_sparse

 - func: mv.out(Tensor self, Tensor vec, *, Tensor(a!) out) -> Tensor(a!)
  dispatch:
@ -9833,7 +9832,7 @@
  structured_delegate: erfinv.out
  variants: method, function
  dispatch:
-    SparseCPU, SparseCUDA, SparseMPS: erfinv_sparse
+    SparseCPU, SparseCUDA: erfinv_sparse
    SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: erfinv_sparse_csr
  tags: pointwise

@ -9842,7 +9841,7 @@
  structured_delegate: erfinv.out
  variants: method
  dispatch:
-    SparseCPU, SparseCUDA, SparseMPS: erfinv_sparse_
+    SparseCPU, SparseCUDA: erfinv_sparse_
    SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: erfinv_sparse_csr_
  tags: pointwise

@ -9852,7 +9851,7 @@
  structured_inherits: TensorIteratorBase
  dispatch:
    CPU, CUDA, MPS: erfinv_out
-    SparseCPU, SparseCUDA, SparseMPS: erfinv_sparse_out
+    SparseCPU, SparseCUDA: erfinv_sparse_out
    SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: erfinv_sparse_csr_out
  tags: pointwise

--- a/aten/src/ATen/native/sparse/mps/SparseMPSTensorMath.mm
+++ b/aten/src/ATen/native/sparse/mps/SparseMPSTensorMath.mm
@ -10,10 +10,6 @@
 #include <ATen/NativeFunctions.h>
 #else
 #include <ATen/ops/_coalesce_native.h>
-#include <ATen/ops/repeat_interleave_native.h>
-#include <ATen/ops/cumsum.h>
-#include <ATen/ops/_sparse_sparse_matmul_native.h>
-#include <ATen/ops/_sparse_coo_tensor_unsafe.h>
 #include <ATen/ops/_sparse_coo_tensor_unsafe_native.h>
 #include <ATen/ops/cat.h>
 #include <ATen/ops/add_native.h>
@ -892,114 +888,5 @@ static void sparse_mask_intersection_out_mps_kernel(
      /*coalesce_mask=*/false);
 }

-Tensor sparse_sparse_matmul_mps(const Tensor& mat1_, const Tensor& mat2_) {
-  TORCH_CHECK(mat1_.is_sparse() && mat2_.is_sparse(),
-              "sparse_sparse_matmul_mps: both inputs must be sparse COO tensors");
-  TORCH_CHECK(mat1_.is_mps() && mat2_.is_mps(),
-              "sparse_sparse_matmul_mps: both inputs must be on MPS device");
-  TORCH_CHECK(mat1_.dim() == 2 && mat2_.dim() == 2,
-              "sparse_sparse_matmul_mps: both inputs must be 2D matrices");
-  TORCH_CHECK(mat1_.dense_dim() == 0 && mat2_.dense_dim() == 0,
-              "sparse_sparse_matmul_mps: only scalar values supported (dense_dim == 0)");
-  TORCH_CHECK(mat1_.size(1) == mat2_.size(0),
-              "mat1 and mat2 shapes cannot be multiplied (", mat1_.size(0), "x", mat1_.size(1), " and ", mat2_.size(0), "x", mat2_.size(1), ")");
-  TORCH_CHECK(mat1_.scalar_type() == mat2_.scalar_type(),
-              "sparse_sparse_matmul_mps: mat1 dtype ", mat1_.scalar_type(),
-              " does not match mat2 dtype ", mat2_.scalar_type());
-
-  const auto device = mat1_.device();
-
-  auto A = mat1_.coalesce();
-  auto B = mat2_.coalesce();
-
-  const auto I = A.size(0);
-  const auto K = A.size(1);
-  const auto N = B.size(1);
-
-  const auto nnzA = A._nnz();
-  const auto nnzB = B._nnz();
-
-  // Early empty result, return an empty, coalesced tensor
-  if (I == 0 || N == 0 || K == 0 || nnzA == 0 || nnzB == 0) {
-    auto empty_idx = at::empty({2, 0}, at::device(device).dtype(at::kLong));
-    auto empty_val = at::empty({0}, at::device(device).dtype(mat1_.scalar_type()));
-    auto out = _sparse_coo_tensor_unsafe(empty_idx, empty_val, {I, N}, mat1_.options());
-    out._coalesced_(true);
-    return out;
-  }
-
-  const auto computeDtype = at::result_type(mat1_, mat2_);
-
-  auto A_idx = A._indices().contiguous();
-  auto A_val = A._values().to(computeDtype).contiguous();
-  auto A_i = A_idx.select(0, 0).contiguous();
-  auto A_k = A_idx.select(0, 1).contiguous();
-
-  auto B_idx = B._indices().contiguous();
-  auto B_val = B._values().to(computeDtype).contiguous();
-  auto B_k = B_idx.select(0, 0).contiguous();
-  auto B_j = B_idx.select(0, 1).contiguous();
-
-  // csr-style row pointers for B by k (the shared dimension)
-  Tensor row_ptr_B;
-  {
-    auto batch_ptr = at::tensor({0LL, nnzB}, at::device(device).dtype(at::kLong));
-    row_ptr_B = at::empty({K + 1}, at::device(device).dtype(at::kLong));
-    build_row_ptr_per_batch_mps(B_k, batch_ptr, /*B=*/1, /*I=*/K, row_ptr_B);
-  }
-
-  auto row_ptr_B_lo = row_ptr_B.narrow(0, 0, K);
-  auto row_ptr_B_hi = row_ptr_B.narrow(0, 1, K);
-  auto deg_B = row_ptr_B_hi.sub(row_ptr_B_lo);
-
-  auto counts = deg_B.index_select(0, A_k);
-
-  const int64_t P = counts.sum().item<int64_t>();
-  if (P == 0) {
-    auto empty_idx = at::empty({2, 0}, at::device(device).dtype(at::kLong));
-    auto empty_val = at::empty({0}, at::device(device).dtype(mat1_.scalar_type()));
-    auto out = _sparse_coo_tensor_unsafe(empty_idx, empty_val, {I, N}, mat1_.options());
-    out._coalesced_(true);
-    return out;
-  }
-
-  auto group_ids = repeat_interleave_mps(counts);
-
-  // exclusive cumsum of counts
-  auto offsets = cumsum(counts, /*dim=*/0).sub(counts);
-  auto offsets_gather = offsets.index_select(0, group_ids);
-  auto within = at::arange(P, at::device(device).dtype(at::kLong)).sub(offsets_gather);
-
-  // Map each output element to its source B row and position
-  auto k_per_out = A_k.index_select(0, group_ids);
-  auto start_in_B = row_ptr_B.index_select(0, k_per_out);
-  auto seg_index = start_in_B.add(within);
-
-  // Assemble candidate coo pairs and values
-  auto i_out = A_i.index_select(0, group_ids).contiguous();
-  auto j_out = B_j.index_select(0, seg_index).contiguous();
-  auto vA_out = A_val.index_select(0, group_ids).contiguous();
-  auto vB_out = B_val.index_select(0, seg_index).contiguous();
-  auto v_out = vA_out.mul(vB_out);
-
-  // build (2, P) indices
-  auto out_indices = at::empty({2, P}, at::device(device).dtype(at::kLong)).contiguous();
-  out_indices.select(0, 0).copy_(i_out);
-  out_indices.select(0, 1).copy_(j_out);
-
-  auto result = _sparse_coo_tensor_unsafe(
-      out_indices, v_out, {I, N}, mat1_.options().dtype(computeDtype));
-
-  result = result.coalesce();
-
-  if (result.scalar_type() != mat1_.scalar_type()) {
-    auto cast_vals = result._values().to(mat1_.scalar_type());
-    auto out = _sparse_coo_tensor_unsafe(result._indices(), cast_vals, {I, N}, mat1_.options());
-    out._coalesced_(true);
-    return out;
-  }
-  return result;
-}
-
 REGISTER_MPS_DISPATCH(sparse_mask_intersection_out_stub, &sparse_mask_intersection_out_mps_kernel);
 } // namespace at::native
--- a/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp
+++ b/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp
@ -478,7 +478,7 @@ bool check_cudnn_tensor_shapes(sdp_params const& params, bool debug) {
  const auto s_k = params.key.sym_size(2);
  const auto d_qk = params.query.sym_size(3);
  const auto d_v = params.value.sym_size(3);
-  long cudnn_version = at::detail::getCUDAHooks().versionRuntimeCuDNN();
+  long cudnn_version = at::detail::getCUDAHooks().versionCuDNN();
  if (cudnn_version < 8903) {
    if (debug) {
      TORCH_WARN("SDPA fprop requires cudnn 8.9.3 or higher");
@ -709,7 +709,7 @@ bool can_use_cudnn_attention(const sdp_params& params, bool debug) {
  return false;
 #endif
 #if defined(CUDNN_VERSION)
-  static auto cudnn_version = at::detail::getCUDAHooks().versionRuntimeCuDNN();
+  static auto cudnn_version = cudnnGetVersion();
  if (params.dropout > 0.0 && cudnn_version > 91100 && cudnn_version < 91400) {
    if (debug) {
      TORCH_WARN(CUDNN_VERSION, " cuDNN version does not support droppout in SDPA (9.11 - 9.13).");
--- a/benchmarks/sparse/spmm.py
+++ b/benchmarks/sparse/spmm.py
@ -52,18 +52,19 @@ def test_sparse_coo_and_csr(m, n, k, nnz, test_count):
        start.record()
        coo.matmul(mat)
        stop.record()
+
        times.append(start.elapsed_time(stop))

-    coo_mean_time = sum(times) / len(times)
+    coo_mean_time = sum(times) / len(times)  # averaged once, after the COO loop

-    times = []
-    for _ in range(test_count):
-        start.record()
-        csr.matmul(mat)
-        stop.record()
-        times.append(start.elapsed_time(stop))
+    times = []  # fresh sample list so CSR timings do not mix with COO timings
+    for _ in range(test_count):
+        start.record()
+        csr.matmul(mat)
+        stop.record()
+        times.append(start.elapsed_time(stop))

-    csr_mean_time = sum(times) / len(times)
+    csr_mean_time = sum(times) / len(times)  # averaged once, after the CSR loop

    return coo_mean_time, csr_mean_time

--- a/c10/core/AutogradState.h
+++ b/c10/core/AutogradState.h
@ -1,8 +1,6 @@
 #pragma once

-#include <c10/core/SafePyObject.h>
 #include <c10/macros/Export.h>
-#include <optional>

 namespace c10 {

@ -17,8 +15,7 @@ struct C10_API AutogradState {
      bool inference_mode,
      bool fw_grad_mode,
      bool multithreading_enabled)
-      : graph_exec_group_(std::nullopt),
-        grad_mode_(grad_mode),
+      : grad_mode_(grad_mode),
        inference_mode_(inference_mode),
        fw_grad_mode_(fw_grad_mode),
        multithreading_enabled_(multithreading_enabled),
@ -44,10 +41,6 @@ struct C10_API AutogradState {
    view_replay_enabled_ = view_replay_enabled;
  }

-  void set_graph_exec_group(std::optional<SafePyObject> group) {
-    graph_exec_group_ = std::move(group);
-  }
-
  bool get_grad_mode() const {
    return grad_mode_;
  }
@ -68,12 +61,7 @@ struct C10_API AutogradState {
    return view_replay_enabled_;
  }

-  const std::optional<SafePyObject>& get_graph_exec_group() const {
-    return graph_exec_group_;
-  }
-
 private:
-  std::optional<SafePyObject> graph_exec_group_;
  bool grad_mode_ : 1;
  bool inference_mode_ : 1;
  bool fw_grad_mode_ : 1;
--- a/c10/core/CachingDeviceAllocator.h
+++ b/c10/core/CachingDeviceAllocator.h
@ -96,10 +96,6 @@ struct C10_API DeviceAllocator : public c10::Allocator {

  // Resets peak memory usage statistics for the specified device
  virtual void resetPeakStats(c10::DeviceIndex device) = 0;
-
-  // Return the free memory size and total memory size in bytes for the
-  // specified device.
-  virtual std::pair<size_t, size_t> getMemoryInfo(c10::DeviceIndex device) = 0;
 };

 // This function is used to get the DeviceAllocator for a specific device type
--- a/c10/cuda/CUDAAllocatorConfig.cpp
+++ b/c10/cuda/CUDAAllocatorConfig.cpp
@ -106,9 +106,6 @@ void CUDAAllocatorConfig::parseArgs(const std::string& env) {
    } else if (key == "graph_capture_record_stream_reuse") {
      i = parseGraphCaptureRecordStreamReuse(tokenizer, i);
      used_native_specific_option = true;
-    } else if (key == "per_process_memory_fraction") {
-      i = parsePerProcessMemoryFraction(tokenizer, i);
-      used_native_specific_option = true;
    } else {
      const auto& keys =
          c10::CachingAllocator::AcceleratorAllocatorConfig::getKeys();
@ -149,18 +146,6 @@ size_t CUDAAllocatorConfig::parseGraphCaptureRecordStreamReuse(
  return i;
 }

-double CUDAAllocatorConfig::parsePerProcessMemoryFraction(
-    const c10::CachingAllocator::ConfigTokenizer& tokenizer,
-    size_t i) {
-  tokenizer.checkToken(++i, ":");
-  double val_env = tokenizer.toDouble(++i);
-  TORCH_CHECK_VALUE(
-      val_env >= 0.0 && val_env <= 1.0,
-      "per_process_memory_fraction is invalid, set it in [0.0, 1.0]");
-  m_per_process_memory_fraction = val_env;
-  return i;
-}
-
 size_t CUDAAllocatorConfig::parsePinnedNumRegisterThreads(
    const c10::CachingAllocator::ConfigTokenizer& tokenizer,
    size_t i) {
--- a/c10/cuda/CUDAAllocatorConfig.h
+++ b/c10/cuda/CUDAAllocatorConfig.h
@ -61,10 +61,6 @@ class C10_CUDA_API CUDAAllocatorConfig {
    return instance().m_graph_capture_record_stream_reuse;
  }

-  static double per_process_memory_fraction() {
-    return instance().m_per_process_memory_fraction;
-  }
-
  /** Pinned memory allocator settings */
  static bool pinned_use_cuda_host_register() {
    return instance().m_pinned_use_cuda_host_register;
@ -156,8 +152,7 @@ class C10_CUDA_API CUDAAllocatorConfig {
        "pinned_use_hip_host_register",
        "graph_capture_record_stream_reuse",
        "pinned_reserve_segment_size_mb",
-        "pinned_num_register_threads",
-        "per_process_memory_fraction"};
+        "pinned_num_register_threads"};
    return keys;
  }

@ -182,9 +177,6 @@ class C10_CUDA_API CUDAAllocatorConfig {
  size_t parseGraphCaptureRecordStreamReuse(
      const c10::CachingAllocator::ConfigTokenizer& tokenizer,
      size_t i);
-  double parsePerProcessMemoryFraction(
-      const c10::CachingAllocator::ConfigTokenizer& tokenizer,
-      size_t i);

  std::atomic<size_t> m_pinned_num_register_threads{1};
  std::atomic<size_t> m_pinned_reserve_segment_size_mb{0};
@ -197,7 +189,6 @@ class C10_CUDA_API CUDAAllocatorConfig {
  std::atomic<bool> m_release_lock_on_cudamalloc{false};
  std::atomic<bool> m_pinned_use_cuda_host_register{false};
  std::atomic<bool> m_graph_capture_record_stream_reuse{false};
-  std::atomic<double> m_per_process_memory_fraction{1.0};
 };

 // Keep this for backwards compatibility
--- a/c10/cuda/CUDACachingAllocator.cpp
+++ b/c10/cuda/CUDACachingAllocator.cpp
@ -1100,7 +1100,7 @@ class RingBuffer {
 } // anonymous namespace
 } // namespace Native

-static std::string reportProcessMemoryInfo(const cudaDeviceProp& prop) {
+static std::string reportProcessMemoryInfo(c10::DeviceIndex device) {
 #ifdef PYTORCH_C10_DRIVER_API_SUPPORTED
  void* nvml_handle = DriverAPI::get_nvml_handle();
  if (!nvml_handle) {
@ -1111,6 +1111,9 @@ static std::string reportProcessMemoryInfo(const cudaDeviceProp& prop) {
    return true;
  }();

+  cudaDeviceProp prop{};
+  C10_CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
+
  // NOLINTNEXTLINE(*-c-arrays)
  char pci_id[80];
  snprintf(
@ -1212,16 +1215,14 @@ class DeviceCachingAllocator {
  // record used memory.
  size_t total_allocated_memory = 0;

-  cudaDeviceProp device_prop;
-
-  // maximum amount of memory that device is allowed to
-  // allocate. This is set iff memory fraction is less than 1
-  std::optional<size_t> allowed_memory_maximum{std::nullopt};
+  size_t allowed_memory_maximum = 0;

  // all live expandable segments
  std::vector<ExpandableSegment*> expandable_segments_;
  std::vector<c10::DeviceIndex> devices_with_peer_access_;

+  bool set_fraction = false;
+
  bool record_history = false;

  std::atomic<CreateContextFn> context_recorder_;
@ -1263,9 +1264,6 @@ class DeviceCachingAllocator {
      : device_id(id),
        large_blocks(/*small=*/false),
        small_blocks(/*small=*/true) {
-    C10_CUDA_CHECK(cudaGetDeviceProperties(&device_prop, id));
-
-    setMemoryFraction(CUDAAllocatorConfig::per_process_memory_fraction());
    stats.max_split_size =
        static_cast<int64_t>(AcceleratorAllocatorConfig::max_split_size());
    context_recorder_.store(nullptr);
@ -1401,7 +1399,7 @@ class DeviceCachingAllocator {
    if (!block_found) {
      // Do garbage collection if the flag is set.
      if (C10_UNLIKELY(
-              allowed_memory_maximum.has_value() &&
+              set_fraction &&
              AcceleratorAllocatorConfig::garbage_collection_threshold() >
                  0.0)) {
        garbage_collect_cached_blocks(context);
@ -1458,12 +1456,11 @@ class DeviceCachingAllocator {
      C10_CUDA_CHECK(cudaMemGetInfo(&device_free, &device_total));
      std::string allowed_info;

-      if (allowed_memory_maximum.has_value()) {
-        allowed_info =
-            format_size(allowed_memory_maximum.value()) + " allowed; ";
+      if (set_fraction) {
+        allowed_info = format_size(allowed_memory_maximum) + " allowed; ";
      }

-      std::string proc_info = reportProcessMemoryInfo(device_prop);
+      std::string proc_info = reportProcessMemoryInfo(device_id);

      record_trace(
          TraceEntry::OOM,
@ -1521,7 +1518,7 @@ class DeviceCachingAllocator {
      for (const auto& obs : observers_local) {
        obs(device_id,
            alloc_size,
-            allowed_memory_maximum.value_or(device_total),
+            set_fraction ? allowed_memory_maximum : device_total,
            device_free);
      }

@ -2018,26 +2015,25 @@ class DeviceCachingAllocator {

  /** get memory fraction limiting maximum allocated memory **/
  double getMemoryFraction() {
-    if (!allowed_memory_maximum.has_value()) {
+    if (!set_fraction) {
      return 1.0;
    }

-    return static_cast<double>(allowed_memory_maximum.value()) /
-        static_cast<double>(device_prop.totalGlobalMem);
+    size_t device_free = 0;
+    size_t device_total = 0;
+    C10_CUDA_CHECK(cudaMemGetInfo(&device_free, &device_total));
+    return static_cast<double>(allowed_memory_maximum) /
+        static_cast<double>(device_total);
  }

  /** set memory fraction to limit maximum allocated memory **/
  void setMemoryFraction(double fraction) {
-    TORCH_CHECK(
-        0 <= fraction && fraction <= 1,
-        "invalid fraction:",
-        fraction,
-        ". Please set within [0, 1].");
-    allowed_memory_maximum = std::nullopt;
-    if (fraction < 1.0) {
-      allowed_memory_maximum = static_cast<size_t>(
-          fraction * static_cast<double>(device_prop.totalGlobalMem));
-    }
+    size_t device_free = 0;
+    size_t device_total = 0;
+    C10_CUDA_CHECK(cudaMemGetInfo(&device_free, &device_total));
+    allowed_memory_maximum =
+        static_cast<size_t>(fraction * static_cast<double>(device_total));
+    set_fraction = true;
  }

  /** get expandable segment size for all the streams on device **/
@ -3014,7 +3010,7 @@ class DeviceCachingAllocator {
    BlockPool& pool = *p.pool;

    if (C10_UNLIKELY(
-            allowed_memory_maximum.has_value() &&
+            set_fraction &&
            AcceleratorAllocatorConfig::garbage_collection_threshold() > 0.0)) {
      // Track block reuse interval only when garbage collection is enabled.
      ++pool.get_free_blocks_call_count;
@ -3087,7 +3083,7 @@ class DeviceCachingAllocator {

    size_t gc_threshold = static_cast<size_t>(
        AcceleratorAllocatorConfig::garbage_collection_threshold() *
-        static_cast<double>(allowed_memory_maximum.value()));
+        static_cast<double>(allowed_memory_maximum));
    // No need to trigger GC yet
    if (total_allocated_memory <= gc_threshold) {
      return;
@ -3165,8 +3161,8 @@ class DeviceCachingAllocator {

    bool active_pool =
        p.pool->owner_PrivatePool && p.pool->owner_PrivatePool->allocator();
-    if (allowed_memory_maximum.has_value() &&
-        total_allocated_memory + size > allowed_memory_maximum.value()) {
+    if (set_fraction &&
+        total_allocated_memory + size > allowed_memory_maximum) {
      p.err = cudaErrorMemoryAllocation;
      return false;
      // Temporarily disable checkpointing & cudagraphs internally
@ -3863,6 +3859,7 @@ class NativeCachingAllocator : public CUDAAllocator {
        "Allocator not initialized for device ",
        device,
        ": did you call init?");
+    C10_CUDA_CHECK(c10::cuda::SetDevice(device));
    return device_allocator[device]->getMemoryFraction();
  }

@ -3872,6 +3869,12 @@ class NativeCachingAllocator : public CUDAAllocator {
        "Allocator not initialized for device ",
        device,
        ": did you call init?");
+    TORCH_CHECK(
+        0 <= fraction && fraction <= 1,
+        "invalid fraction:",
+        fraction,
+        ". Please set within [0, 1].");
+    C10_CUDA_CHECK(c10::cuda::SetDevice(device));
    device_allocator[device]->setMemoryFraction(fraction);
  }

--- a/c10/cuda/CUDACachingAllocator.h
+++ b/c10/cuda/CUDACachingAllocator.h
@ -2,7 +2,6 @@

 #include <c10/core/AllocatorConfig.h>
 #include <c10/core/CachingDeviceAllocator.h>
-#include <c10/cuda/CUDAAllocatorConfig.h>
 #include <c10/cuda/CUDAGraphsC10Utils.h>
 #include <c10/cuda/CUDAMacros.h>
 #include <c10/cuda/CUDAStream.h>
@ -345,13 +344,6 @@ class CUDAAllocator : public DeviceAllocator {
      c10::DeviceIndex device,
      std::shared_ptr<AllocatorState> pps) = 0;
  virtual std::string name() = 0;
-  std::pair<size_t, size_t> getMemoryInfo(c10::DeviceIndex device) override {
-    c10::DeviceGuard device_guard({at::kCUDA, device});
-    size_t free = 0;
-    size_t total = 0;
-    C10_CUDA_CHECK(cudaMemGetInfo(&free, &total));
-    return {free, total};
-  }
 };

 // Allocator object, statically initialized
--- a/c10/cuda/CUDAMallocAsyncAllocator.cpp
+++ b/c10/cuda/CUDAMallocAsyncAllocator.cpp
@ -427,6 +427,7 @@ struct CudaMallocAsyncAllocator : public CUDAAllocator {
  // on the current device each later call sees.
  void init(int dev_count) override {
    static bool called = [](int dev_count) {
+      ;
      // Are there external guarantees init will be called before
      // any of the allocator's other functions?
      // std::lock_guard<std::mutex> lk(general_mutex);
--- a/c10/test/build.bzl
+++ b/c10/test/build.bzl
@ -66,15 +66,6 @@ def define_targets(rules):
        ],
    )

-    rules.cc_test(
-        name = "util/nofatal_test",
-        srcs = ["util/nofatal_test.cpp"],
-        deps = [
-            "//c10/util:base",
-            "@com_google_googletest//:gtest_main",
-        ],
-    )
-
    rules.cc_test(
        name = "util/ssize_test",
        srcs = ["util/ssize_test.cpp"],
--- a/c10/test/util/nofatal_test.cpp
+++ b/c10/test/util/nofatal_test.cpp
@ -1,53 +0,0 @@
-#include <gtest/gtest.h>
-
-#include <c10/util/Exception.h>
-#include <c10/util/Logging.h>
-
-namespace {
-template <typename T>
-inline void expectThrowsEq(T&& fn, const char* expected_msg) {
-  try {
-    std::forward<T>(fn)();
-  } catch (const c10::Error& e) {
-    EXPECT_TRUE(
-        std::string(e.what_without_backtrace()).find(expected_msg) !=
-        std::string::npos);
-    return;
-  }
-  ADD_FAILURE() << "Expected to throw exception with message \"" << expected_msg
-                << "\" but didn't throw";
-}
-} // namespace
-
-TEST(NofatalTest, TorchCheckComparisons) {
-  // quick make sure that no-op works as expected
-  TORCH_CHECK_EQ(1, 1) << "i am a silly message " << 1;
-  expectThrowsEq(
-      []() { TORCH_CHECK_EQ(1, 2) << "i am a silly message " << 1; },
-      "Check failed: 1 == 2 (1 vs. 2). i am a silly message 1");
-  expectThrowsEq(
-      []() { TORCH_CHECK_NE(2, 2); }, "Check failed: 2 != 2 (2 vs. 2).");
-  expectThrowsEq(
-      []() { TORCH_CHECK_LT(2, 2); }, "Check failed: 2 < 2 (2 vs. 2).");
-  expectThrowsEq(
-      []() { TORCH_CHECK_LE(3, 2); }, "Check failed: 3 <= 2 (3 vs. 2).");
-  expectThrowsEq(
-      []() { TORCH_CHECK_GT(2, 2); }, "Check failed: 2 > 2 (2 vs. 2).");
-  expectThrowsEq(
-      []() { TORCH_CHECK_GE(2, 3); }, "Check failed: 2 >= 3 (2 vs. 3).");
-  expectThrowsEq(
-      []() {
-        void* p = nullptr;
-        TORCH_CHECK_NOTNULL(p);
-      },
-      "Check failed: 'p' must be non NULL.");
-
-#if GTEST_HAS_DEATH_TEST
-#ifndef NDEBUG
-  // if dbg build, DCHECK should result in deth
-  EXPECT_DEATH(TORCH_DCHECK_EQ(1, 2), "Check failed");
-#else
-  TORCH_DCHECK_EQ(1, 2); // no-op
-#endif
-#endif // GTEST_HAS_DEATH_TEST
-}
--- a/c10/util/Exception.h
+++ b/c10/util/Exception.h
@ -702,98 +702,6 @@ namespace c10::detail {
 #define TORCH_CHECK_ARG(cond, argN, ...) \
  TORCH_CHECK(cond, "invalid argument ", argN, ": ", __VA_ARGS__)

-#ifndef FATAL_IF
-#ifdef C10_USE_GLOG
-#define FATAL_IF(condition)                                              \
-  condition ? (void)0                                                    \
-            : ::c10::LoggerVoidify() &                                   \
-          ::c10::MessageLogger(__FILE__, __LINE__, ::google::GLOG_FATAL) \
-              .stream()
-#else
-#define FATAL_IF(condition)            \
-  condition ? (void)0                  \
-            : ::c10::LoggerVoidify() & \
-          ::c10::MessageLogger(__FILE__, __LINE__, ::c10::GLOG_FATAL).stream()
-#endif
-#endif
-
-#ifndef NON_FATAL_IF
-#ifdef C10_USE_GLOG
-#define NON_FATAL_IF(condition)                                \
-  condition ? (void)0                                          \
-            : ::c10::LoggerVoidify() &                         \
-          ::c10::MessageLogger(                                \
-              __FILE__, __LINE__, ::google::GLOG_FATAL, false) \
-              .stream()
-#else
-#define NON_FATAL_IF(condition)                                              \
-  condition ? (void)0                                                        \
-            : ::c10::LoggerVoidify() &                                       \
-          ::c10::MessageLogger(__FILE__, __LINE__, ::c10::GLOG_FATAL, false) \
-              .stream()
-#endif
-#endif
-
-// Binary comparison check macros
-#define TORCH_CHECK_OP(val1, val2, op)                                      \
-  NON_FATAL_IF(((val1)op(val2)))                                            \
-      << "Check failed: " #val1 " " #op " " #val2 " (" << (val1) << " vs. " \
-      << (val2) << "). "
-
-#define TORCH_DCHECK_OP(val1, val2, op)                                       \
-  FATAL_IF(((val1)op(val2))) << "Check failed: " #val1 " " #op " " #val2 " (" \
-                             << (val1) << " vs. " << (val2) << "). "
-
-#define TORCH_CHECK_EQ(val1, val2) TORCH_CHECK_OP(val1, val2, ==)
-#define TORCH_CHECK_NE(val1, val2) TORCH_CHECK_OP(val1, val2, !=)
-#define TORCH_CHECK_LE(val1, val2) TORCH_CHECK_OP(val1, val2, <=)
-#define TORCH_CHECK_LT(val1, val2) TORCH_CHECK_OP(val1, val2, <)
-#define TORCH_CHECK_GE(val1, val2) TORCH_CHECK_OP(val1, val2, >=)
-#define TORCH_CHECK_GT(val1, val2) TORCH_CHECK_OP(val1, val2, >)
-
-// Debug versions of TORCH_CHECK_OP macros
-#ifndef NDEBUG
-#define TORCH_DCHECK_EQ(val1, val2) TORCH_DCHECK_OP(val1, val2, ==)
-#define TORCH_DCHECK_NE(val1, val2) TORCH_DCHECK_OP(val1, val2, !=)
-#define TORCH_DCHECK_LE(val1, val2) TORCH_DCHECK_OP(val1, val2, <=)
-#define TORCH_DCHECK_LT(val1, val2) TORCH_DCHECK_OP(val1, val2, <)
-#define TORCH_DCHECK_GE(val1, val2) TORCH_DCHECK_OP(val1, val2, >=)
-#define TORCH_DCHECK_GT(val1, val2) TORCH_DCHECK_OP(val1, val2, >)
-#else // !NDEBUG
-// Optimized versions - generate no code
-#define TORCH_DCHECK_EQ(val1, val2) \
-  while (false)                     \
-  TORCH_DCHECK_OP(val1, val2, ==)
-#define TORCH_DCHECK_NE(val1, val2) \
-  while (false)                     \
-  TORCH_DCHECK_OP(val1, val2, !=)
-#define TORCH_DCHECK_LE(val1, val2) \
-  while (false)                     \
-  TORCH_DCHECK_OP(val1, val2, <=)
-#define TORCH_DCHECK_LT(val1, val2) \
-  while (false)                     \
-  TORCH_DCHECK_OP(val1, val2, <)
-#define TORCH_DCHECK_GE(val1, val2) \
-  while (false)                     \
-  TORCH_DCHECK_OP(val1, val2, >=)
-#define TORCH_DCHECK_GT(val1, val2) \
-  while (false)                     \
-  TORCH_DCHECK_OP(val1, val2, >)
-#endif // NDEBUG
-
-// Null pointer check macro
-#define TORCH_CHECK_NOTNULL(val) \
-  ::c10::CheckNotNull(__FILE__, __LINE__, #val, (val), false)
-
-#ifndef NDEBUG
-#define TORCH_DCHECK_NOTNULL(val) \
-  ::c10::CheckNotNull(__FILE__, __LINE__, #val, (val), true)
-#else // !NDEBUG
-#define TORCH_DCHECK_NOTNULL(val) \
-  while (false)                   \
-  TORCH_CHECK_NOTNULL(val)
-#endif // NDEBUG
-
 // ----------------------------------------------------------------------------
 // Deprecated macros
 // ----------------------------------------------------------------------------
--- a/c10/util/Logging.cpp
+++ b/c10/util/Logging.cpp
@ -291,32 +291,6 @@ namespace c10 {
 using fLB::FLAGS_logtostderr;
 using fLI::FLAGS_minloglevel;
 using fLI::FLAGS_v;
-
-MessageLogger::MessageLogger(
-    const char* file,
-    int line,
-    int severity,
-    bool exit_on_fatal)
-    : stream_(), severity_(severity), exit_on_fatal_(exit_on_fatal) {}
-
-MessageLogger::~MessageLogger() noexcept(false) {
-  if (severity_ == ::google::GLOG_FATAL) {
-    DealWithFatal();
-  }
-}
-
-std::stringstream& MessageLogger::stream() {
-  return stream_;
-}
-
-void MessageLogger::DealWithFatal() {
-  if (exit_on_fatal_) {
-    LOG(FATAL) << stream_.str();
-  } else {
-    throw c10::Error(stream_.str(), nullptr, nullptr);
-  }
-}
-
 } // namespace c10

 C10_DEFINE_int(
@ -438,16 +412,17 @@ void ShowLogInfoToStderr() {
  FLAGS_caffe2_log_level = GLOG_INFO;
 }

-MessageLogger::MessageLogger(
-    const char* file,
-    int line,
-    int severity,
-    bool exit_on_fatal)
-    : severity_(severity), exit_on_fatal_(exit_on_fatal) {
+MessageLogger::MessageLogger(const char* file, int line, int severity)
+    : severity_(severity) {
  if (severity_ < FLAGS_caffe2_log_level) {
    // Nothing needs to be logged.
    return;
  }
+#ifdef ANDROID
+  tag_ = "native";
+#else // !ANDROID
+  tag_ = "";
+#endif // ANDROID

  time_t rawtime = 0;
  time(&rawtime);
@ -483,7 +458,7 @@ MessageLogger::MessageLogger(
 }

 // Output the contents of the stream to the proper channel on destruction.
-MessageLogger::~MessageLogger() noexcept(false) {
+MessageLogger::~MessageLogger() {
  if (severity_ < FLAGS_caffe2_log_level) {
    // Nothing needs to be logged.
    return;
@ -523,18 +498,6 @@ MessageLogger::~MessageLogger() noexcept(false) {
  }
 }

-std::stringstream& MessageLogger::stream() {
-  return stream_;
-}
-
-void MessageLogger::DealWithFatal() {
-  if (exit_on_fatal_) {
-    abort();
-  } else {
-    throw c10::Error(stream_.str(), nullptr, nullptr);
-  }
-}
-
 } // namespace c10

 #endif // !C10_USE_GLOG
--- a/c10/util/logging_common.h
+++ b/c10/util/logging_common.h
@ -1,74 +0,0 @@
-#ifndef C10_UTIL_LOGGING_COMMON_H_
-#define C10_UTIL_LOGGING_COMMON_H_
-
-#include <c10/macros/Export.h>
-#include <sstream>
-
-namespace c10 {
-
-// MessageLogger that throws exceptions instead of aborting (glog version)
-// or logs and may abort (non-glog version).
-class C10_API MessageLogger {
- public:
-  MessageLogger(
-      const char* file,
-      int line,
-      int severity,
-      bool exit_on_fatal = true);
-  ~MessageLogger() noexcept(false);
-
-  // Return the stream associated with the logger object.
-  std::stringstream& stream();
-
- private:
-  // When there is a fatal log, and fatal == true, we abort
-  // otherwise, we throw.
-  void DealWithFatal();
-
-#if defined(ANDROID) && !defined(C10_USE_GLOG)
-  const char* tag_{"native"};
-#endif
-  std::stringstream stream_;
-  int severity_;
-  bool exit_on_fatal_;
-};
-
-// This class is used to explicitly ignore values in the conditional
-// logging macros. This avoids compiler warnings like "value computed
-// is not used" and "statement has no effect".
-class C10_API LoggerVoidify {
- public:
-  LoggerVoidify() = default;
-  // This has to be an operator with a precedence lower than << but
-  // higher than ?:
-  void operator&(const std::ostream& s [[maybe_unused]]) {}
-};
-
-// Forward declarations for CheckNotNull functions
-template <typename T>
-T& CheckNotNullCommon(
-    const char* file,
-    int line,
-    const char* names,
-    T& t,
-    bool fatal = true);
-
-template <typename T>
-T* CheckNotNull(
-    const char* file,
-    int line,
-    const char* names,
-    T* t,
-    bool fatal = true);
-
-template <typename T>
-T& CheckNotNull(
-    const char* file,
-    int line,
-    const char* names,
-    T& t,
-    bool fatal = true);
-
-} // namespace c10
-
-#endif // C10_UTIL_LOGGING_COMMON_H_
--- a/c10/util/logging_is_google_glog.h
+++ b/c10/util/logging_is_google_glog.h
@ -47,53 +47,57 @@ INSTANTIATE_FOR_CONTAINER(set)

 #endif

-#include <c10/util/logging_common.h>
 #include <glog/logging.h>

-namespace c10 {
+// Additional macros on top of glog
+#define TORCH_CHECK_EQ(val1, val2) CHECK_EQ(val1, val2)
+#define TORCH_CHECK_NE(val1, val2) CHECK_NE(val1, val2)
+#define TORCH_CHECK_LE(val1, val2) CHECK_LE(val1, val2)
+#define TORCH_CHECK_LT(val1, val2) CHECK_LT(val1, val2)
+#define TORCH_CHECK_GE(val1, val2) CHECK_GE(val1, val2)
+#define TORCH_CHECK_GT(val1, val2) CHECK_GT(val1, val2)

-[[noreturn]] void ThrowEnforceNotMet(
-    const char* file,
-    const int line,
-    const char* condition,
-    const std::string& msg,
-    const void* caller);
+#ifndef NDEBUG
+#define TORCH_DCHECK_EQ(val1, val2) DCHECK_EQ(val1, val2)
+#define TORCH_DCHECK_NE(val1, val2) DCHECK_NE(val1, val2)
+#define TORCH_DCHECK_LE(val1, val2) DCHECK_LE(val1, val2)
+#define TORCH_DCHECK_LT(val1, val2) DCHECK_LT(val1, val2)
+#define TORCH_DCHECK_GE(val1, val2) DCHECK_GE(val1, val2)
+#define TORCH_DCHECK_GT(val1, val2) DCHECK_GT(val1, val2)
+#else // NDEBUG
+// These versions generate no code in optimized mode.
+#define TORCH_DCHECK_EQ(val1, val2) \
+  while (false)                     \
+  DCHECK_EQ(val1, val2)
+#define TORCH_DCHECK_NE(val1, val2) \
+  while (false)                     \
+  DCHECK_NE(val1, val2)
+#define TORCH_DCHECK_LE(val1, val2) \
+  while (false)                     \
+  DCHECK_LE(val1, val2)
+#define TORCH_DCHECK_LT(val1, val2) \
+  while (false)                     \
+  DCHECK_LT(val1, val2)
+#define TORCH_DCHECK_GE(val1, val2) \
+  while (false)                     \
+  DCHECK_GE(val1, val2)
+#define TORCH_DCHECK_GT(val1, val2) \
+  while (false)                     \
+  DCHECK_GT(val1, val2)
+#endif // NDEBUG

-template <typename T>
-T& CheckNotNullCommon(
-    const char* file,
-    int line,
-    const char* names,
-    T& t,
-    bool fatal) {
-  if (t == nullptr) {
-    MessageLogger(file, line, ::google::GLOG_FATAL, fatal).stream()
-        << "Check failed: '" << names << "' must be non NULL. ";
-  }
-  return t;
-}
+// Check that a pointer is not null.
+#define TORCH_CHECK_NOTNULL(val) CHECK_NOTNULL(val)

-template <typename T>
-T* CheckNotNull(
-    const char* file,
-    int line,
-    const char* names,
-    T* t,
-    bool fatal) {
-  return CheckNotNullCommon(file, line, names, t, fatal);
-}
-
-template <typename T>
-T& CheckNotNull(
-    const char* file,
-    int line,
-    const char* names,
-    T& t,
-    bool fatal) {
-  return CheckNotNullCommon(file, line, names, t, fatal);
-}
-
-} // namespace c10
+#ifndef NDEBUG
+// Debug only version of TORCH_CHECK_NOTNULL
+#define TORCH_DCHECK_NOTNULL(val) DCHECK_NOTNULL(val)
+#else // NDEBUG
+// Optimized version - generates no code.
+#define TORCH_DCHECK_NOTNULL(val) \
+  while (false)                   \
+  DCHECK_NOTNULL(val)
+#endif // NDEBUG

 // Log with source location information override (to be used in generic
 // warning/error handlers implemented as functions, not macros)
--- a/c10/util/logging_is_not_google_glog.h
+++ b/c10/util/logging_is_not_google_glog.h
@ -13,7 +13,6 @@
 #include <vector>

 #include <c10/util/Flags.h>
-#include <c10/util/logging_common.h>

 const char CAFFE2_SEVERITY_PREFIX[] = "FEWIV";

@ -25,40 +24,61 @@ const int GLOG_ERROR = 2;
 const int GLOG_WARNING = 1;
 const int GLOG_INFO = 0;

+class C10_API MessageLogger {
+ public:
+  MessageLogger(const char* file, int line, int severity);
+  ~MessageLogger();
+  // Return the stream associated with the logger object.
+  std::stringstream& stream() {
+    return stream_;
+  }
+
+ private:
+  // When there is a fatal log, we simply abort.
+  void DealWithFatal() {
+    abort();
+  }
+
+  const char* tag_;
+  std::stringstream stream_;
+  int severity_;
+};
+
+// This class is used to explicitly ignore values in the conditional
+// logging macros.  This avoids compiler warnings like "value computed
+// is not used" and "statement has no effect".
+class C10_API LoggerVoidify {
+ public:
+  LoggerVoidify() = default;
+  // This has to be an operator with a precedence lower than << but
+  // higher than ?:
+  void operator&(const std::ostream& s [[maybe_unused]]) {}
+};
+
+// Log a message and terminate.
+template <class T>
+void LogMessageFatal(const char* file, int line, const T& message) {
+  MessageLogger(file, line, GLOG_FATAL).stream() << message;
+}
+
 // Helpers for TORCH_CHECK_NOTNULL(). Two are necessary to support both raw
 // pointers and smart pointers.
 template <typename T>
-T& CheckNotNullCommon(
-    const char* file,
-    int line,
-    const char* names,
-    T& t,
-    bool fatal) {
+T& CheckNotNullCommon(const char* file, int line, const char* names, T& t) {
  if (t == nullptr) {
-    MessageLogger(file, line, GLOG_FATAL, fatal).stream()
-        << "Check failed: '" << names << "' must be non NULL. ";
+    LogMessageFatal(file, line, std::string(names));
  }
  return t;
 }

 template <typename T>
-T* CheckNotNull(
-    const char* file,
-    int line,
-    const char* names,
-    T* t,
-    bool fatal) {
-  return CheckNotNullCommon(file, line, names, t, fatal);
+T* CheckNotNull(const char* file, int line, const char* names, T* t) {
+  return CheckNotNullCommon(file, line, names, t);
 }

 template <typename T>
-T& CheckNotNull(
-    const char* file,
-    int line,
-    const char* names,
-    T& t,
-    bool fatal) {
-  return CheckNotNullCommon(file, line, names, t, fatal);
+T& CheckNotNull(const char* file, int line, const char* names, T& t) {
+  return CheckNotNullCommon(file, line, names, t);
 }
 } // namespace c10

@ -116,6 +136,65 @@ static_assert(
          ::c10::MessageLogger(__FILE__, __LINE__, ::c10::GLOG_##n).stream()
 #endif // NDEBUG

+#define TORCH_CHECK_OP(val1, val2, op)                                        \
+  FATAL_IF(((val1)op(val2))) << "Check failed: " #val1 " " #op " " #val2 " (" \
+                             << (val1) << " vs. " << (val2) << ") "
+
+// TORCH_CHECK_OP macro definitions
+#define TORCH_CHECK_EQ(val1, val2) TORCH_CHECK_OP(val1, val2, ==)
+#define TORCH_CHECK_NE(val1, val2) TORCH_CHECK_OP(val1, val2, !=)
+#define TORCH_CHECK_LE(val1, val2) TORCH_CHECK_OP(val1, val2, <=)
+#define TORCH_CHECK_LT(val1, val2) TORCH_CHECK_OP(val1, val2, <)
+#define TORCH_CHECK_GE(val1, val2) TORCH_CHECK_OP(val1, val2, >=)
+#define TORCH_CHECK_GT(val1, val2) TORCH_CHECK_OP(val1, val2, >)
+
+#ifndef NDEBUG
+// Debug only versions of TORCH_CHECK_OP macros.
+#define TORCH_DCHECK_EQ(val1, val2) TORCH_CHECK_OP(val1, val2, ==)
+#define TORCH_DCHECK_NE(val1, val2) TORCH_CHECK_OP(val1, val2, !=)
+#define TORCH_DCHECK_LE(val1, val2) TORCH_CHECK_OP(val1, val2, <=)
+#define TORCH_DCHECK_LT(val1, val2) TORCH_CHECK_OP(val1, val2, <)
+#define TORCH_DCHECK_GE(val1, val2) TORCH_CHECK_OP(val1, val2, >=)
+#define TORCH_DCHECK_GT(val1, val2) TORCH_CHECK_OP(val1, val2, >)
+#else // NDEBUG
+// These versions generate no code in optimized mode.
+#define TORCH_DCHECK_EQ(val1, val2) \
+  while (false)                     \
+  TORCH_CHECK_OP(val1, val2, ==)
+#define TORCH_DCHECK_NE(val1, val2) \
+  while (false)                     \
+  TORCH_CHECK_OP(val1, val2, !=)
+#define TORCH_DCHECK_LE(val1, val2) \
+  while (false)                     \
+  TORCH_CHECK_OP(val1, val2, <=)
+#define TORCH_DCHECK_LT(val1, val2) \
+  while (false)                     \
+  TORCH_CHECK_OP(val1, val2, <)
+#define TORCH_DCHECK_GE(val1, val2) \
+  while (false)                     \
+  TORCH_CHECK_OP(val1, val2, >=)
+#define TORCH_DCHECK_GT(val1, val2) \
+  while (false)                     \
+  TORCH_CHECK_OP(val1, val2, >)
+#endif // NDEBUG
+
+// Check that a pointer is not null.
+#define TORCH_CHECK_NOTNULL(val) \
+  ::c10::CheckNotNull(           \
+      __FILE__, __LINE__, "Check failed: '" #val "' Must be non NULL", (val))
+
+#ifndef NDEBUG
+// Debug only version of TORCH_CHECK_NOTNULL
+#define TORCH_DCHECK_NOTNULL(val) \
+  ::c10::CheckNotNull(            \
+      __FILE__, __LINE__, "Check failed: '" #val "' Must be non NULL", (val))
+#else // NDEBUG
+// Optimized version - generates no code.
+#define TORCH_DCHECK_NOTNULL(val) \
+  while (false)                   \
+  TORCH_CHECK_NOTNULL(val)
+#endif // NDEBUG
+
 // ---------------------- Support for std objects --------------------------
 // These are adapted from glog to support a limited set of logging capability
 // for STL objects.
--- a/c10/xpu/XPUCachingAllocator.cpp
+++ b/c10/xpu/XPUCachingAllocator.cpp
@ -926,14 +926,15 @@ class DeviceCachingAllocator {
          (release_cached_blocks() && alloc_block(params, true));
    }
    if (!block_found) {
-      const auto& raw_device = c10::xpu::get_raw_device(device);
-      const auto device_total =
-          raw_device.get_info<sycl::info::device::global_mem_size>();
+      c10::xpu::DeviceProp device_prop;
+      c10::xpu::get_device_properties(&device_prop, device);
+      auto device_total = device_prop.global_mem_size;
      // Estimate the available device memory when the SYCL runtime does not
      // support the corresponding aspect (ext_intel_free_memory).
-      size_t device_free = device_total -
+      size_t device_free = device_prop.global_mem_size -
          stats.reserved_bytes[static_cast<size_t>(StatType::AGGREGATE)]
              .current;
+      auto& raw_device = c10::xpu::get_raw_device(device);
      // TODO: Remove the aspect check once the SYCL runtime bug is fixed on
      // affected devices.
      if (raw_device.has(sycl::aspect::ext_intel_free_memory)) {
@ -1051,37 +1052,21 @@ class DeviceCachingAllocator {
    }
  }

-  std::pair<size_t, size_t> getMemoryInfo() {
-    const auto& device = c10::xpu::get_raw_device(device_index);
-    const size_t total = device.get_info<sycl::info::device::global_mem_size>();
-    TORCH_CHECK(
-        device.has(sycl::aspect::ext_intel_free_memory),
-        "The device (",
-        device.get_info<sycl::info::device::name>(),
-        ") doesn't support querying the available free memory. ",
-        "You can file an issue at https://github.com/pytorch/pytorch/issues ",
-        "to help us prioritize its implementation.");
-    const size_t free =
-        device.get_info<sycl::ext::intel::info::device::free_memory>();
-    return {free, total};
-  }
-
  double getMemoryFraction() {
    if (!set_fraction) {
      return 1.0;
    }

-    const auto device_total =
-        xpu::get_raw_device(device_index)
-            .get_info<sycl::info::device::global_mem_size>();
+    c10::xpu::DeviceProp device_prop;
+    c10::xpu::get_device_properties(&device_prop, device_index);
    return static_cast<double>(allowed_memory_maximum) /
-        static_cast<double>(device_total);
+        static_cast<double>(device_prop.global_mem_size);
  }

  void setMemoryFraction(double fraction) {
-    const auto device_total =
-        xpu::get_raw_device(device_index)
-            .get_info<sycl::info::device::global_mem_size>();
+    c10::xpu::DeviceProp device_prop;
+    c10::xpu::get_device_properties(&device_prop, device_index);
+    auto device_total = device_prop.global_mem_size;
    allowed_memory_maximum = static_cast<size_t>(fraction * device_total);
    set_fraction = true;
  }
@ -1255,11 +1240,6 @@ class XPUAllocator : public DeviceAllocator {
        c10::xpu::get_raw_device(dev_to_access));
  }

-  std::pair<size_t, size_t> getMemoryInfo(DeviceIndex device) override {
-    assertValidDevice(device);
-    return device_allocators[device]->getMemoryInfo();
-  }
-
  double getMemoryFraction(DeviceIndex device) {
    assertValidDevice(device);
    return device_allocators[device]->getMemoryFraction();
--- a/caffe2/CMakeLists.txt
+++ b/caffe2/CMakeLists.txt
@ -1941,7 +1941,6 @@ if(BUILD_TEST)
    foreach(test_src ${Caffe2_XPU_TEST_SRCS})
      get_filename_component(test_name ${test_src} NAME_WE)
      add_executable(${test_name} "${test_src}")
-      torch_compile_options(${test_name})
      target_link_libraries(${test_name} torch_library gtest_main)
      target_include_directories(${test_name} PRIVATE $<INSTALL_INTERFACE:include>)
      target_include_directories(${test_name} PRIVATE ${Caffe2_CPU_INCLUDE})
--- a/caffe2/perfkernels/batch_box_cox_vec.h
+++ b/caffe2/perfkernels/batch_box_cox_vec.h
@ -73,19 +73,6 @@ void box_cox_zero_lambda(
  }
 }

-template <typename T>
-at::vec::Vectorized<T> box_cox_nonzero_lambda_impl(
-    at::vec::Vectorized<T> data,
-    at::vec::Vectorized<T> lambda1,
-    at::vec::Vectorized<T> lambda2,
-    at::vec::Vectorized<T> k_eps) {
-  auto sum = data + lambda2;
-  auto max = at::vec::max(sum, k_eps);
-  auto lambda_over_1 = at::vec::fast_recieprocal(lambda1);
-  auto pow = max.pow(lambda1);
-  return at::vec::fmsub(pow, lambda_over_1, lambda_over_1);
-}
-
 template <typename T>
 void box_cox_nonzero_lambda(
    int64_t D,
@ -101,18 +88,21 @@ void box_cox_nonzero_lambda(
  auto k_eps_vec = Vec(k_eps);
  for(; j + VLEN < D; j += VLEN) {
    auto data = Vec::loadu(data_ptr + j);
-    auto lambda1 = Vec::loadu(lambda1_ptr + j);
    auto lambda2 = Vec::loadu(lambda2_ptr + j);
-    auto res = box_cox_nonzero_lambda_impl(data, lambda1, lambda2, k_eps_vec);
+    auto sum = data + lambda2;
+    auto max = at::vec::max(sum, k_eps_vec);
+    auto lambda1 = Vec::loadu(lambda1_ptr + j);
+    auto lambda_over_1 = at::vec::fast_recieprocal(lambda1);
+    auto pow = max.pow(lambda1);
+    auto res = at::vec::fmsub(pow, lambda_over_1, lambda_over_1);
    res.store(out + j);
  }
-  if (j < D) {
-    auto remaining = D - j;
-    auto data = Vec::loadu(data_ptr + j, remaining);
-    auto lambda1 = Vec::loadu(lambda1_ptr + j, remaining);
-    auto lambda2 = Vec::loadu(lambda2_ptr + j, remaining);
-    auto res = box_cox_nonzero_lambda_impl(data, lambda1, lambda2, k_eps_vec);
-    res.store(out + j, remaining);
+  for ( ;j < D; ++j) {
+    auto sum = data_ptr[j] + lambda2_ptr[j];
+    auto max = std::max(sum, k_eps);
+    auto lambda_over_1 = at::vec::fast_recieprocal(lambda1_ptr[j]);
+    auto pow = std::pow(max, lambda1_ptr[j]);
+    out[j] = pow * lambda_over_1 - lambda_over_1;
  }
 }
 #else
--- a/docs/source/accelerator.md
+++ b/docs/source/accelerator.md
@ -40,7 +40,6 @@
    :nosignatures:

     empty_cache
-     get_memory_info
     max_memory_allocated
     max_memory_reserved
     memory_allocated
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@ -206,41 +206,6 @@ templates_path = [
    os.path.join(os.path.dirname(pytorch_sphinx_theme2.__file__), "templates"),
 ]
 # TODO: document these and remove them from here.
-# Fixes the duplicated
-autosummary_filename_map = {
-    "torch.nn.utils.prune.identity": "torch.nn.utils.prune.identity_function",
-    "torch.nn.utils.prune.Identity": "torch.nn.utils.prune.Identity_class",
-    "torch.optim.adamw.adamw": "torch.optim.adamw.adamw_function",
-    "torch.optim.adamw.AdamW": "torch.optim.adamw.AdamW_class",
-    "torch.optim.asgd.asgd": "torch.optim.asgd.asgd_function",
-    "torch.optim.asgd.ASGD": "torch.optim.asgd.ASGD_class",
-    "torch.optim.nadam.nadam": "torch.optim.nadam.nadam_function",
-    "torch.optim.nadam.NAdam": "torch.optim.nadam.NAdam_class",
-    "torch.optim.radam.radam": "torch.optim.radam.radam_function",
-    "torch.optim.radam.RAdam": "torch.optim.radam.RAdam_class",
-    "torch.optim.rmsprop.rmsprop": "torch.optim.rmsprop.rmsprop_function",
-    "torch.optim.rmsprop.RMSprop": "torch.optim.rmsprop.RMSprop_class",
-    "torch.optim.rprop.rprop": "torch.optim.rprop.rprop_function",
-    "torch.optim.rprop.Rprop": "torch.optim.rprop.Rprop_class",
-    "torch.optim.sgd.sgd": "torch.optim.sgd.sgd_function",
-    "torch.optim.sgd.SGD": "torch.optim.sgd.SGD_class",
-    "torch.optim.adadelta.adadelta": "torch.optim.adadelta.adadelta_function",
-    "torch.optim.adadelta.Adadelta": "torch.optim.adadelta.Adadelta_class",
-    "torch.optim.adagrad.adagrad": "torch.optim.adagrad.adagrad_function",
-    "torch.optim.adagrad.Adagrad": "torch.optim.adagrad.Adagrad_class",
-    "torch.optim.adam.adam": "torch.optim.adam.adam_function",
-    "torch.optim.adam.Adam": "torch.optim.adam.Adam_class",
-    "torch.optim.adamax.adamax": "torch.optim.adamax.adamax_function",
-    "torch.optim.adamax.Adamax": "torch.optim.adamax.Adamax_class",
-    "torch.mtia.stream": "torch.mtia.stream_function",
-    "torch.mtia.Stream": "torch.mtia.Stream_class",
-    "torch.cpu.stream": "torch.cpu.stream_function",
-    "torch.cpu.Stream": "torch.cpu.Stream_class",
-    "torch.cuda.stream": "torch.cuda.stream_function",
-    "torch.cuda.Stream": "torch.cuda.Stream_class",
-    "torch.xpu.stream": "torch.xpu.stream_function",
-    "torch.xpu.Stream": "torch.xpu.Stream_class",
-}

 coverage_ignore_functions = [
    # torch
@ -382,6 +347,20 @@ coverage_ignore_functions = [
    # torch.ao.quantization.backend_config.tensorrt
    "get_tensorrt_backend_config",
    "get_tensorrt_backend_config_dict",
+    # torch.ao.quantization.backend_config.utils
+    "entry_to_pretty_str",
+    "get_fused_module_classes",
+    "get_fuser_method_mapping",
+    "get_fusion_pattern_to_extra_inputs_getter",
+    "get_fusion_pattern_to_root_node_getter",
+    "get_module_to_qat_module",
+    "get_pattern_to_dtype_configs",
+    "get_pattern_to_input_type_to_index",
+    "get_qat_module_classes",
+    "get_root_module_to_quantized_reference_module",
+    "pattern_to_human_readable",
+    "remove_boolean_dispatch_from_name",
+    # torch.ao.quantization.backend_config.x86
    "get_x86_backend_config",
    # torch.ao.quantization.fuse_modules
    "fuse_known_modules",
@ -412,6 +391,25 @@ coverage_ignore_functions = [
    "insert_observers_for_model",
    "prepare",
    "propagate_dtypes_for_known_nodes",
+    # torch.ao.quantization.fx.utils
+    "all_node_args_except_first",
+    "all_node_args_have_no_tensors",
+    "assert_and_get_unique_device",
+    "collect_producer_nodes",
+    "create_getattr_from_value",
+    "create_node_from_old_node_preserve_meta",
+    "get_custom_module_class_keys",
+    "get_linear_prepack_op_for_dtype",
+    "get_new_attr_name_with_prefix",
+    "get_non_observable_arg_indexes_and_types",
+    "get_qconv_prepack_op",
+    "get_skipped_module_name_and_classes",
+    "graph_module_from_producer_nodes",
+    "maybe_get_next_module",
+    "node_arg_is_bias",
+    "node_arg_is_weight",
+    "return_arg_list",
+    # torch.ao.quantization.pt2e.graph_utils
    "bfs_trace_with_node_process",
    "find_sequential_partitions",
    "get_equivalent_types",
@ -827,10 +825,80 @@ coverage_ignore_functions = [
    "get_latency_of_one_partition",
    "get_latency_of_partitioned_graph",
    "get_partition_to_latency_mapping",
+    # torch.fx.experimental.proxy_tensor
+    "decompose",
+    "disable_autocast_cache",
+    "disable_proxy_modes_tracing",
+    "dispatch_trace",
+    "extract_val",
+    "fake_signature",
+    "fetch_sym_proxy",
+    "fetch_object_proxy",
+    "get_innermost_proxy_mode",
+    "get_isolated_graphmodule",
+    "get_proxy_slot",
+    "get_torch_dispatch_modes",
+    "has_proxy_slot",
+    "is_sym_node",
+    "maybe_handle_decomp",
+    "proxy_call",
+    "set_meta",
+    "set_original_aten_op",
+    "set_proxy_slot",
+    "snapshot_fake",
+    "thunkify",
+    "track_tensor",
+    "track_tensor_tree",
+    "wrap_key",
+    "wrapper_and_args_for_make_fx",
+    # torch.fx.experimental.recording
    "record_shapeenv_event",
    "replay_shape_env_events",
    "shape_env_check_state_equal",
+    # torch.fx.experimental.sym_node
+    "ceil_impl",
+    "floor_ceil_helper",
+    "floor_impl",
+    "method_to_operator",
+    "sympy_is_channels_last_contiguous_2d",
+    "sympy_is_channels_last_contiguous_3d",
+    "sympy_is_channels_last_strides_2d",
+    "sympy_is_channels_last_strides_3d",
+    "sympy_is_channels_last_strides_generic",
+    "sympy_is_contiguous",
+    "sympy_is_contiguous_generic",
+    "to_node",
+    "wrap_node",
    "sym_sqrt",
+    # torch.fx.experimental.symbolic_shapes
+    "bind_symbols",
+    "cast_symbool_to_symint_guardless",
+    "create_contiguous",
+    "error",
+    "eval_guards",
+    "eval_is_non_overlapping_and_dense",
+    "expect_true",
+    "find_symbol_binding_fx_nodes",
+    "free_symbols",
+    "free_unbacked_symbols",
+    "fx_placeholder_targets",
+    "fx_placeholder_vals",
+    "guard_bool",
+    "guard_float",
+    "guard_int",
+    "guard_scalar",
+    "has_hint",
+    "has_symbolic_sizes_strides",
+    "is_channels_last_contiguous_2d",
+    "is_channels_last_contiguous_3d",
+    "is_channels_last_strides_2d",
+    "is_channels_last_strides_3d",
+    "is_contiguous",
+    "is_non_overlapping_and_dense_indicator",
+    "is_nested_int",
+    "is_symbol_binding_fx_node",
+    "is_symbolic",
+    # torch.fx.experimental.unification.core
    "reify",
    # torch.fx.experimental.unification.match
    "edge",
@ -868,6 +936,24 @@ coverage_ignore_functions = [
    "reverse_dict",
    # torch.fx.experimental.unification.multipledispatch.variadic
    "isvariadic",
+    # torch.fx.experimental.unification.unification_tools
+    "assoc",
+    "assoc_in",
+    "dissoc",
+    "first",
+    "get_in",
+    "getter",
+    "groupby",
+    "itemfilter",
+    "itemmap",
+    "keyfilter",
+    "keymap",
+    "merge",
+    "merge_with",
+    "update_in",
+    "valfilter",
+    "valmap",
+    # torch.fx.experimental.unification.utils
    "freeze",
    "hashable",
    "raises",
@ -1308,8 +1394,319 @@ coverage_ignore_functions = [
    # torch.onnx.symbolic_opset7
    "max",
    "min",
+    # torch.onnx.symbolic_opset8
+    "addmm",
+    "bmm",
+    "empty",
+    "empty_like",
+    "flatten",
+    "full",
+    "full_like",
+    "gt",
+    "lt",
+    "matmul",
+    "mm",
+    "ones",
+    "ones_like",
+    "prelu",
+    "repeat",
+    "zeros",
+    "zeros_like",
+    # torch.onnx.symbolic_opset9
+    "abs",
+    "acos",
+    "adaptive_avg_pool1d",
+    "adaptive_avg_pool2d",
+    "adaptive_avg_pool3d",
+    "adaptive_max_pool1d",
+    "adaptive_max_pool2d",
+    "adaptive_max_pool3d",
+    "add",
+    "addcmul",
+    "addmm",
+    "alias",
+    "amax",
+    "amin",
+    "aminmax",
+    "arange",
+    "argmax",
+    "argmin",
+    "as_strided",
+    "as_tensor",
+    "asin",
+    "atan",
+    "atan2",
+    "avg_pool1d",
+    "avg_pool2d",
+    "avg_pool3d",
+    "baddbmm",
+    "batch_norm",
+    "bernoulli",
+    "bitwise_not",
+    "bitwise_or",
+    "bmm",
+    "broadcast_tensors",
+    "broadcast_to",
+    "bucketize",
+    "cat",
+    "cdist",
+    "ceil",
+    "clamp",
+    "clamp_max",
+    "clamp_min",
+    "clone",
+    "constant_pad_nd",
+    "contiguous",
+    "conv1d",
+    "conv2d",
+    "conv3d",
+    "conv_tbc",
+    "conv_transpose1d",
+    "conv_transpose2d",
+    "conv_transpose3d",
+    "convert_element_type",
+    "convolution",
+    "cos",
+    "cosine_similarity",
+    "cross",
+    "cumsum",
+    "detach",
    "dim",
+    "div",
+    "dot",
+    "dropout",
+    "elu",
+    "embedding",
+    "embedding_bag",
+    "empty",
+    "empty_like",
+    "eq",
+    "erf",
+    "exp",
+    "expand",
+    "expand_as",
+    "eye",
+    "fill",
+    "flatten",
+    "floor",
+    "floor_divide",
+    "floordiv",
+    "frobenius_norm",
+    "full",
+    "full_like",
+    "gather",
+    "ge",
+    "gelu",
+    "get_pool_ceil_padding",
+    "glu",
+    "group_norm",
+    "gru",
+    "gt",
+    "hann_window",
+    "hardshrink",
+    "hardsigmoid",
+    "hardswish",
+    "hardtanh",
+    "index",
+    "index_add",
+    "index_copy",
+    "index_fill",
+    "index_put",
+    "index_select",
+    "instance_norm",
+    "is_floating_point",
+    "is_pinned",
+    "isnan",
+    "item",
+    "kl_div",
+    "layer_norm",
+    "le",
+    "leaky_relu",
+    "lerp",
+    "lift",
+    "linalg_cross",
+    "linalg_matrix_norm",
+    "linalg_norm",
+    "linalg_vector_norm",
+    "linear",
+    "linspace",
+    "log",
+    "log10",
+    "log1p",
+    "log2",
+    "log_sigmoid",
+    "log_softmax",
+    "logical_and",
+    "logical_not",
+    "logical_or",
+    "logical_xor",
+    "logit",
+    "logsumexp",
+    "lstm",
+    "lstm_cell",
+    "lt",
+    "masked_fill",
+    "masked_fill_",
+    "matmul",
+    "max",
+    "max_pool1d",
+    "max_pool1d_with_indices",
+    "max_pool2d",
+    "max_pool2d_with_indices",
+    "max_pool3d",
+    "max_pool3d_with_indices",
+    "maximum",
+    "meshgrid",
+    "min",
+    "minimum",
+    "mish",
+    "mm",
+    "movedim",
+    "mse_loss",
+    "mul",
+    "multinomial",
+    "mv",
+    "narrow",
+    "native_layer_norm",
+    "ne",
+    "neg",
+    "new_empty",
+    "new_full",
+    "new_ones",
+    "new_zeros",
+    "nonzero",
+    "nonzero_numpy",
+    "noop_complex_operators",
+    "norm",
+    "numel",
+    "numpy_T",
+    "one_hot",
+    "ones",
+    "ones_like",
+    "onnx_placeholder",
+    "overload_by_arg_count",
+    "pad",
+    "pairwise_distance",
+    "permute",
+    "pixel_shuffle",
+    "pixel_unshuffle",
+    "pow",
+    "prelu",
+    "prim_constant",
+    "prim_constant_chunk",
+    "prim_constant_split",
+    "prim_data",
+    "prim_device",
+    "prim_dtype",
+    "prim_if",
+    "prim_layout",
+    "prim_list_construct",
+    "prim_list_unpack",
+    "prim_loop",
+    "prim_max",
+    "prim_min",
+    "prim_shape",
+    "prim_tolist",
+    "prim_tuple_construct",
+    "prim_type",
+    "prim_unchecked_cast",
+    "prim_uninitialized",
+    "rand",
+    "rand_like",
+    "randint",
+    "randint_like",
+    "randn",
+    "randn_like",
+    "reciprocal",
+    "reflection_pad",
+    "relu",
+    "relu6",
+    "remainder",
+    "repeat",
+    "repeat_interleave",
+    "replication_pad",
+    "reshape",
+    "reshape_as",
+    "rnn_relu",
+    "rnn_tanh",
+    "roll",
+    "rrelu",
+    "rsqrt",
+    "rsub",
+    "scalar_tensor",
+    "scatter",
+    "scatter_add",
+    "select",
+    "selu",
+    "sigmoid",
+    "sign",
+    "silu",
+    "sin",
+    "size",
+    "slice",
+    "softmax",
+    "softplus",
+    "softshrink",
+    "sort",
+    "split",
+    "split_with_sizes",
+    "sqrt",
+    "square",
+    "squeeze",
+    "stack",
+    "std",
+    "std_mean",
+    "sub",
+    "t",
+    "take",
+    "tan",
+    "tanh",
+    "tanhshrink",
+    "tensor",
+    "threshold",
+    "to",
+    "topk",
+    "transpose",
+    "true_divide",
+    "type_as",
+    "unbind",
+    "unfold",
+    "unsafe_chunk",
+    "unsafe_split",
+    "unsafe_split_with_sizes",
+    "unsqueeze",
+    "unsupported_complex_operators",
+    "unused",
+    "upsample_bilinear2d",
+    "upsample_linear1d",
+    "upsample_nearest1d",
+    "upsample_nearest2d",
+    "upsample_nearest3d",
+    "upsample_trilinear3d",
+    "var",
+    "var_mean",
+    "view",
+    "view_as",
+    "where",
+    "wrap_logical_op_with_cast_to",
+    "wrap_logical_op_with_negation",
+    "zero",
+    "zeros",
+    "zeros_like",
+    # torch.onnx.utils
+    "disable_apex_o2_state_dict_hook",
    "export",
+    "export_to_pretty_string",
+    "exporter_context",
+    "is_in_onnx_export",
+    "model_signature",
+    "register_custom_op_symbolic",
+    "select_model_mode_for_export",
+    "setup_onnx_logging",
+    "unconvertible_ops",
+    "unpack_quantized_tensor",
+    "warn_on_static_input_change",
+    # torch.onnx.verification
    "check_export_model_diff",
    "verify",
    "verify_aten_graph",
@ -1400,6 +1797,32 @@ coverage_ignore_functions = [
    "noop_context_fn",
    "set_checkpoint_early_stop",
    "set_device_states",
+    # torch.utils.collect_env
+    "check_release_file",
+    "get_cachingallocator_config",
+    "get_clang_version",
+    "get_cmake_version",
+    "get_conda_packages",
+    "get_cpu_info",
+    "get_cuda_module_loading_config",
+    "get_cudnn_version",
+    "get_env_info",
+    "get_gcc_version",
+    "get_gpu_info",
+    "get_libc_version",
+    "get_lsb_version",
+    "get_mac_version",
+    "get_nvidia_driver_version",
+    "get_nvidia_smi",
+    "get_os",
+    "get_pip_packages",
+    "get_platform",
+    "get_pretty_env_info",
+    "get_python_platform",
+    "get_running_cuda_version",
+    "get_windows_version",
+    "is_xnnpack_available",
+    "pretty_str",
    # torch.utils.cpp_backtrace
    "get_cpp_backtrace",
    # torch.utils.cpp_extension
@ -1463,6 +1886,52 @@ coverage_ignore_functions = [
    "apply_shuffle_seed",
    "apply_shuffle_settings",
    "get_all_graph_pipes",
+    # torch.utils.flop_counter
+    "addmm_flop",
+    "baddbmm_flop",
+    "bmm_flop",
+    "conv_backward_flop",
+    "conv_flop",
+    "conv_flop_count",
+    "convert_num_with_suffix",
+    "get_shape",
+    "get_suffix_str",
+    "mm_flop",
+    "normalize_tuple",
+    "register_flop_formula",
+    "sdpa_backward_flop",
+    "sdpa_backward_flop_count",
+    "sdpa_flop",
+    "sdpa_flop_count",
+    "shape_wrapper",
+    "transpose_shape",
+    # torch.utils.hipify.hipify_python
+    "add_dim3",
+    "compute_stats",
+    "extract_arguments",
+    "file_add_header",
+    "file_specific_replacement",
+    "find_bracket_group",
+    "find_closure_group",
+    "find_parentheses_group",
+    "fix_static_global_kernels",
+    "get_hip_file_path",
+    "hip_header_magic",
+    "hipify",
+    "is_caffe2_gpu_file",
+    "is_cusparse_file",
+    "is_out_of_place",
+    "is_pytorch_file",
+    "is_special_file",
+    "match_extensions",
+    "matched_files_iter",
+    "openf",
+    "preprocess_file_and_save_result",
+    "preprocessor",
+    "processKernelLaunches",
+    "replace_extern_shared",
+    "replace_math_functions",
+    "str2bool",
    # torch.utils.hooks
    "unserializable_hook",
    "warn_if_has_hooks",
@ -2726,11 +3195,6 @@ autodoc_type_aliases = {
 # Enable overriding of function signatures in the first line of the docstring.
 autodoc_docstring_signature = True

-# Exclude inherited IntEnum methods that have RST formatting issues in their docstrings
-autodoc_default_options = {
-    "exclude-members": "from_bytes, to_bytes",
-}
-
 # -- katex javascript in header
 #
 #    def setup(app):
--- a/docs/source/fx.experimental.md
+++ b/docs/source/fx.experimental.md
@ -12,37 +12,6 @@ These APIs are experimental and subject to change without notice.
 .. autoclass:: torch.fx.experimental.sym_node.DynamicInt
 ```

-## torch.fx.experimental.sym_node
-
-```{eval-rst}
-.. currentmodule:: torch.fx.experimental.sym_node
-```
-
-```{eval-rst}
-.. automodule:: torch.fx.experimental.sym_node
-```
-
-```{eval-rst}
-.. autosummary::
-    :toctree: generated
-    :nosignatures:
-
-    is_channels_last_contiguous_2d
-    is_channels_last_contiguous_3d
-    is_channels_last_strides_2d
-    is_channels_last_strides_3d
-    is_contiguous
-    is_non_overlapping_and_dense_indicator
-    method_to_operator
-    sympy_is_channels_last_contiguous_2d
-    sympy_is_channels_last_contiguous_3d
-    sympy_is_channels_last_strides_2d
-    sympy_is_channels_last_strides_3d
-    sympy_is_channels_last_strides_generic
-    sympy_is_contiguous
-    sympy_is_contiguous_generic
-```
-
 ## torch.fx.experimental.symbolic_shapes

 ```{eval-rst}
@ -100,25 +69,6 @@ These APIs are experimental and subject to change without notice.
    rebind_unbacked
    resolve_unbacked_bindings
    is_accessor_node
-    cast_symbool_to_symint_guardless
-    create_contiguous
-    error
-    eval_guards
-    eval_is_non_overlapping_and_dense
-    find_symbol_binding_fx_nodes
-    free_symbols
-    free_unbacked_symbols
-    fx_placeholder_targets
-    fx_placeholder_vals
-    guard_bool
-    guard_float
-    guard_int
-    guard_scalar
-    has_hint
-    has_symbolic_sizes_strides
-    is_nested_int
-    is_symbol_binding_fx_node
-    is_symbolic
 ```

 ## torch.fx.experimental.proxy_tensor
@ -141,46 +91,4 @@ These APIs are experimental and subject to change without notice.
    get_proxy_mode
    maybe_enable_thunkify
    maybe_disable_thunkify
-    decompose
-    disable_autocast_cache
-    disable_proxy_modes_tracing
-    extract_val
-    fake_signature
-    fetch_object_proxy
-    fetch_sym_proxy
-    has_proxy_slot
-    is_sym_node
-    maybe_handle_decomp
-    proxy_call
-    set_meta
-    set_original_aten_op
-    set_proxy_slot
-    snapshot_fake
 ```
-
-## torch.fx.experimental.unification.unification_tools
-
-```{eval-rst}
-.. currentmodule:: torch.fx.experimental.unification.unification_tools
-```
-
-```{eval-rst}
-.. automodule:: torch.fx.experimental.unification.unification_tools
-```
-
-```{eval-rst}
-.. autosummary::
-    :toctree: generated
-    :nosignatures:
-
-    assoc
-    assoc_in
-    dissoc
-    first
-    keyfilter
-    keymap
-    merge
-    merge_with
-    update_in
-    valfilter
-    valmap
--- a/docs/source/fx.md
+++ b/docs/source/fx.md
@ -1134,6 +1134,7 @@ The set of leaf modules can be customized by overriding
 .. py:module:: torch.fx.experimental.refinement_types
 .. py:module:: torch.fx.experimental.rewriter
 .. py:module:: torch.fx.experimental.schema_type_annotation
+.. py:module:: torch.fx.experimental.sym_node
 .. py:module:: torch.fx.experimental.unification.core
 .. py:module:: torch.fx.experimental.unification.dispatch
 .. py:module:: torch.fx.experimental.unification.match
@ -1143,6 +1144,7 @@ The set of leaf modules can be customized by overriding
 .. py:module:: torch.fx.experimental.unification.multipledispatch.dispatcher
 .. py:module:: torch.fx.experimental.unification.multipledispatch.utils
 .. py:module:: torch.fx.experimental.unification.multipledispatch.variadic
+.. py:module:: torch.fx.experimental.unification.unification_tools
 .. py:module:: torch.fx.experimental.unification.utils
 .. py:module:: torch.fx.experimental.unification.variable
 .. py:module:: torch.fx.experimental.unify_refinements
--- a/docs/source/notes/cuda.rst
+++ b/docs/source/notes/cuda.rst
@ -619,10 +619,6 @@ Available options:
  and reallocate buffers across multiple streams, especially when the capture DAG frequently
  reaches joined frontiers.

-* ``per_process_memory_fraction`` option limits the amount of memory that can be allocated
-  on all the CUDA devices to a specified fraction of the available memory. This is a value
-  between 0 and 1. Attempting to allocate more memory will raise an out of memory error.
-
 .. note::

    Some stats reported by the
--- a/docs/source/notes/libtorch_stable_abi.md
+++ b/docs/source/notes/libtorch_stable_abi.md
@ -46,108 +46,6 @@ These headers are promised to be ABI stable across releases and adhere to a stro
 Unless absolutely necessary, we recommend the high-level C++ API in `torch/csrc/stable`
 which will handle all the rough edges of the C API for the user.

-## Migrating your kernel to the LibTorch stable ABI
-
-If you'd like your kernel to be ABI stable with LibTorch, meaning you'd the ability to build for one version and run on another, your kernel must only use the limited stable ABI. This following section goes through some steps of migrating an existing kernel and APIs we imagine you would need to swap over.
-
-Firstly, instead of registering kernels through `TORCH_LIBRARY`, LibTorch ABI stable kernels must be registered via `STABLE_TORCH_LIBRARY`. Note that, for the time being, implementations registered via `STABLE_TORCH_LIBRARY` must be boxed unlike `TORCH_LIBRARY`. See the simple example below or our docs on [Stack-based APIs](stack-based-apis) for more details. For kernels that are registered via `pybind`, before using the stable ABI, it would be useful to migrate to register them via `TORCH_LIBRARY`.
-
-While previously your kernels might have included APIs from `<torch/*.h>` (for example, `<torch/all.h>`), they are now limited to including from the 3 categories of headers mentioned above (`torch/csrc/stable/*.h`, `torch/headeronly/*.h` and the stable C headers). This means that your extension should no longer use any utilities from the `at::` or `c10::` namespaces but instead use their replacements in `torch::stable` and `torch::headeronly`. To provide a couple examples of the necessary migrations:
- all uses of `at::Tensor` must be replaced with `torch::stable::Tensor`
- all uses of `TORCH_CHECK` must be replaced with `STD_TORCH_CHECK`
- all uses of `at::kCUDA` must be replaced with `torch::headeronly::kCUDA` etc.
- native functions such as `at::pad` must be replaced with `torch::stable::pad`
- native functions that are called as Tensor methods (e.g., `Tensor.pad`) must be replaced with the ATen variant through `torch::stable::pad`.
-
-As mentioned above, the LibTorch stable ABI is still under development. If there is any API or feature you would like to see added to the stable ABI/`torch::headeronly`/`torch::stable`, please file a request through a [new issue on the PyTorch repo](https://github.com/pytorch/pytorch/issues).
-
-Below is a simple example of migrating an existing kernel that uses `TORCH_LIBRARY` to the stable ABI (`TORCH_STABLE_LIBRARY`). For a larger end to end example you can take a look at the FA3 repository. Specifically the diff between [`flash_api.cpp`](https://github.com/Dao-AILab/flash-attention/blob/ad70a007e6287d4f7e766f94bcf2f9a813f20f6b/hopper/flash_api.cpp#L1) and the stable variant [`flash_api_stable.cpp`](https://github.com/Dao-AILab/flash-attention/blob/ad70a007e6287d4f7e766f94bcf2f9a813f20f6b/hopper/flash_api_stable.cpp#L1).
-
-
-### Original Version with `TORCH_LIBRARY`
-
-```cpp
-// original_kernel.cpp - Using TORCH_LIBRARY (not stable ABI)
-#include <torch/torch.h>
-#include <ATen/ATen.h>
-
-namespace myops {
-
-// Simple kernel that adds a scalar value to each element of a tensor
-at::Tensor add_scalar(const at::Tensor& input, double scalar) {
-  TORCH_CHECK(input.scalar_type() == at::kFloat, "Input must be float32");
-
-  return input.add(scalar);
-}
-
-// Register the operator
-TORCH_LIBRARY(myops, m) {
-  m.def("add_scalar(Tensor input, float scalar) -> Tensor", &add_scalar);
-}
-
-// Register the implementation
-TORCH_LIBRARY_IMPL(myops, CompositeExplicitAutograd, m) {
-  m.impl("add_scalar", &add_scalar);
-}
-
-} // namespace myops
-```
-
-### Migrated Version with `STABLE_TORCH_LIBRARY`
-
-```cpp
-// stable_kernel.cpp - Using STABLE_TORCH_LIBRARY (stable ABI)
-
-// (1) Don't include <torch/torch.h> <ATen/ATen.h>
-//     only include APIs from torch/csrc/stable, torch/headeronly and C-shims
-#include <torch/csrc/stable/library.h>
-#include <torch/csrc/stable/tensor_struct.h>
-#include <torch/csrc/stable/ops.h>
-#include <torch/csrc/stable/stableivalue_conversions.h>
-#include <torch/headeronly/core/ScalarType.h>
-#include <torch/headeronly/macros/Macros.h>
-
-namespace myops {
-
-// Simple kernel that adds a scalar value to each element of a tensor
-torch::stable::Tensor add_scalar(const torch::stable::Tensor& input, double scalar) {
-  // (2) use STD_TORCH_CHECK instead of TORCH_CHECK
-  STD_TORCH_CHECK(
-      // (3) use torch::headeronly::kFloat instead of at:kFloat
-      input.scalar_type() == torch::headeronly::kFloat,
-      "Input must be float32");
-
-  // (4) Use stable ops namespace instead of input.add
-  return torch::stable::add(input, scalar);
-}
-
-// (5) Add Boxed wrapper required for STABLE_TORCH_LIBRARY
-void boxed_add_scalar(StableIValue* stack, uint64_t num_args, uint64_t num_outputs) {
-  // Extract arguments from stack using `to<T>`
-  auto input = to<torch::stable::Tensor>(stack[0]);
-  auto scalar = to<double>(stack[1]);
-
-  // Call the actual kernel
-  auto result = add_scalar(input, scalar);
-
-  // Put result back on stack using `from()`
-  // Stack slot 0 now holds the return value
-  stack[0] = from(result);
-}
-
-// (6) Register the operator using STABLE_TORCH_LIBRARY
-STABLE_TORCH_LIBRARY(myops, m) {
-  m.def("add_scalar(Tensor input, float scalar) -> Tensor", &boxed_add_scalar);
-}
-
-// (7) Register the implementation using STABLE_TORCH_LIBRARY_IMPL
-STABLE_TORCH_LIBRARY_IMPL(myops, CompositeExplicitAutograd, m) {
-  m.impl("add_scalar", &boxed_add_scalar);
-}
-
-} // namespace myops
-```
-

 ## How are objects passed across the ABI boundary when interacting with the dispatcher?

@ -211,7 +109,6 @@ There are two invariants for the stack:
    a. When calling a stack-based API, you must give owning references to the calling stack and steal references from the returned stack.
    b. When registering your function to be called with a stack, you must steal references from your argument stack and push onto the stack new references.

-(stack-based-apis)=
 ### Stack-based APIs

 The above is relevant in two places:
--- a/docs/source/quantization-support.md
+++ b/docs/source/quantization-support.md
@ -134,23 +134,6 @@ Quantization to work with this as well.
    ObservationType
 ```

-## torch.ao.quantization.backend_config.utils
-```{eval-rst}
-.. currentmodule:: torch.ao.quantization.backend_config.utils
-```
-
-```{eval-rst}
-.. autosummary::
-    :toctree: generated
-    :nosignatures:
-    :template: classtemplate.rst
-
-    entry_to_pretty_str
-    pattern_to_human_readable
-    remove_boolean_dispatch_from_name
-
-```
-
 ## torch.ao.quantization.fx.custom_config

 This module contains a few CustomConfig classes that's used in both eager mode and FX graph mode quantization
@ -171,30 +154,6 @@ This module contains a few CustomConfig classes that's used in both eager mode a
    StandaloneModuleConfigEntry
 ```

-## torch.ao.quantization.fx.utils
-
-```{eval-rst}
-.. currentmodule:: torch.ao.quantization.fx.utils
-```
-
-```{eval-rst}
-.. autosummary::
-    :toctree: generated
-    :nosignatures:
-    :template: classtemplate.rst
-
-    all_node_args_except_first
-    all_node_args_have_no_tensors
-    collect_producer_nodes
-    create_getattr_from_value
-    create_node_from_old_node_preserve_meta
-    graph_module_from_producer_nodes
-    maybe_get_next_module
-    node_arg_is_bias
-    node_arg_is_weight
-    return_arg_list
-```
-
 ## torch.ao.quantization.quantizer

 ```{eval-rst}
@ -294,6 +253,7 @@ regular full-precision tensor.
 .. autosummary::
    :toctree: generated
    :nosignatures:
+    :template: classtemplate.rst

    view
    as_strided
--- a/docs/source/utils.md
+++ b/docs/source/utils.md
@ -19,91 +19,6 @@
    swap_tensors
 ```

-# torch.utils.collect_env
-```{eval-rst}
-.. automodule:: torch.utils.collect_env
-```
-
-```{eval-rst}
-.. currentmodule:: torch.utils.collect_env
-```
-
-```{eval-rst}
-.. autosummary::
-    :toctree: generated
-    :nosignatures:
-
-    check_release_file
-    is_xnnpack_available
-    pretty_str
-```
-
-# torch.utils.flop_counter
-```{eval-rst}
-.. automodule:: torch.utils.flop_counter
-```
-
-```{eval-rst}
-.. currentmodule:: torch.utils.flop_counter
-```
-
-```{eval-rst}
-.. autosummary::
-    :toctree: generated
-    :nosignatures:
-
-    baddbmm_flop
-    bmm_flop
-    conv_backward_flop
-    conv_flop
-    conv_flop_count
-    register_flop_formula
-    sdpa_backward_flop
-    sdpa_backward_flop_count
-    sdpa_flop
-    sdpa_flop_count
-    shape_wrapper
-```
-
-# torch.utils.hipify.hipify_python
-```{eval-rst}
-.. automodule:: torch.utils.hipify.hipify_python
-```
-
-```{eval-rst}
-.. currentmodule:: torch.utils.hipify.hipify_python
-```
-
-```{eval-rst}
-.. autosummary::
-    :toctree: generated
-    :nosignatures:
-
-    compute_stats
-    extract_arguments
-    file_add_header
-    file_specific_replacement
-    find_bracket_group
-    find_closure_group
-    find_parentheses_group
-    fix_static_global_kernels
-    hip_header_magic
-    hipify
-    is_caffe2_gpu_file
-    is_cusparse_file
-    is_out_of_place
-    is_pytorch_file
-    is_special_file
-    openf
-    preprocess_file_and_save_result
-    preprocessor
-    processKernelLaunches
-    replace_extern_shared
-    replace_math_functions
-    str2bool
-```
-
-
 <!-- This module needs to be documented. Adding here in the meantime
 for tracking purposes -->
 ```{eval-rst}
@ -128,6 +43,7 @@ for tracking purposes -->
 .. py:module:: torch.utils.benchmark.utils.valgrind_wrapper.timer_interface
 .. py:module:: torch.utils.bundled_inputs
 .. py:module:: torch.utils.checkpoint
+.. py:module:: torch.utils.collect_env
 .. py:module:: torch.utils.cpp_backtrace
 .. py:module:: torch.utils.cpp_extension
 .. py:module:: torch.utils.data.backward_compatibility
@ -164,8 +80,10 @@ for tracking purposes -->
 .. py:module:: torch.utils.data.sampler
 .. py:module:: torch.utils.dlpack
 .. py:module:: torch.utils.file_baton
+.. py:module:: torch.utils.flop_counter
 .. py:module:: torch.utils.hipify.constants
 .. py:module:: torch.utils.hipify.cuda_to_hip_mappings
+.. py:module:: torch.utils.hipify.hipify_python
 .. py:module:: torch.utils.hipify.version
 .. py:module:: torch.utils.hooks
 .. py:module:: torch.utils.jit.log_extract
--- a/pyproject.toml
+++ b/pyproject.toml
@ -172,9 +172,9 @@ ignore = [
    "SIM102", "SIM103", "SIM112", # flake8-simplify code styles
    "SIM105", # these ignores are from flake8-simplify. please fix or ignore with commented reason
    "SIM108", # SIM108 ignored because we prefer if-else-block instead of ternary expression
-    "SIM110", # Checks for for loops that can be replaced with a builtin function, like any or all.
+    "SIM110",
    "SIM114", # Combine `if` branches using logical `or` operator
-    "SIM115", # Checks for cases where files are opened without using a context manager.
+    "SIM115",
    "SIM116", # Disable Use a dictionary instead of consecutive `if` statements
    "SIM117",
    "SIM118",
@ -184,6 +184,7 @@ ignore = [
    "TC006",
    # TODO: Remove Python-3.10 specific suppressions
    "B905",
+    "UP035",
 ]
 select = [
    "B",
--- a/setup.py
+++ b/setup.py
@ -630,37 +630,6 @@ def mirror_files_into_torchgen() -> None:
        raise RuntimeError("Check the file paths in `mirror_files_into_torchgen()`")


-def mirror_inductor_external_kernels() -> None:
-    """
-    Copy external kernels into Inductor so they are importable.
-    """
-    paths = [
-        (
-            CWD / "torch/_inductor/kernel/vendored_templates/cutedsl_grouped_gemm.py",
-            CWD
-            / "third_party/cutlass/examples/python/CuTeDSL/blackwell/grouped_gemm.py",
-        ),
-    ]
-    for new_path, orig_path in paths:
-        # Create the dirs involved in new_path if they don't exist
-        if not new_path.exists():
-            new_path.parent.mkdir(parents=True, exist_ok=True)
-
-        # Copy the files from the orig location to the new location
-        if orig_path.is_file():
-            shutil.copyfile(orig_path, new_path)
-            continue
-        if orig_path.is_dir():
-            if new_path.exists():
-                # copytree fails if the tree exists already, so remove it.
-                shutil.rmtree(new_path)
-            shutil.copytree(orig_path, new_path)
-            continue
-        raise RuntimeError(
-            "Check the file paths in `mirror_inductor_external_kernels()`"
-        )
-
-
 # ATTENTION: THIS IS AI SLOP
 def extract_variant_from_version(version: str) -> str:
    """Extract variant from version string, defaulting to 'cpu'."""
@ -1646,7 +1615,6 @@ def main() -> None:
    mirror_files_into_torchgen()
    if RUN_BUILD_DEPS:
        build_deps()
-        mirror_inductor_external_kernels()

    (
        ext_modules,
@ -1681,7 +1649,6 @@ def main() -> None:
        "_inductor/codegen/aoti_runtime/*.cpp",
        "_inductor/script.ld",
        "_inductor/kernel/flex/templates/*.jinja",
-        "_inductor/kernel/templates/*.jinja",
        "_export/serde/*.yaml",
        "_export/serde/*.thrift",
        "share/cmake/ATen/*.cmake",
--- a/test/ao/sparsity/test_data_sparsifier.py
+++ b/test/ao/sparsity/test_data_sparsifier.py
@ -208,7 +208,7 @@ class _BaseDataSparsiferTestCase(TestCase):
        assert len(sparsifier1.data_groups) == len(sparsifier2.data_groups)

        state1 = state_dict1["state"]
-        for name in state1:
+        for name in state1.keys():
            # compare mask
            assert name in sparsifier2.state
            assert "mask" in sparsifier2.state[name]
--- a/test/ao/sparsity/test_scheduler.py
+++ b/test/ao/sparsity/test_scheduler.py
@ -75,7 +75,6 @@ class TestScheduler(TestCase):

 class TestCubicScheduler(TestCase):
    def setUp(self):
-        super().setUp()
        self.model_sparse_config = [
            {"tensor_fqn": "0.weight", "sparsity_level": 0.8},
            {"tensor_fqn": "2.weight", "sparsity_level": 0.4},
--- a/test/ao/sparsity/test_sparsifier.py
+++ b/test/ao/sparsity/test_sparsifier.py
@ -119,7 +119,7 @@ class TestBaseSparsifier(TestCase):
        for idx in range(len(sparsifier0.groups)):
            mg0 = sparsifier0.groups[idx]
            mg1 = sparsifier1.groups[idx]
-            for key in mg0:
+            for key in mg0.keys():
                assert key in mg1
                if key == "module":
                    # We cannot compare modules as they are different
--- a/test/backends/xeon/test_launch.py
+++ b/test/backends/xeon/test_launch.py
@ -11,7 +11,6 @@ from torch.testing._internal.common_utils import IS_LINUX, run_tests, TestCase
@unittest.skipIf(not IS_LINUX, "Only works on linux")
 class TestTorchrun(TestCase):
    def setUp(self):
-        super().setUp()
        self._test_dir = tempfile.mkdtemp(prefix=self.__class__.__name__)

    def tearDown(self):
--- a/test/cpp/aoti_abi_check/CMakeLists.txt
+++ b/test/cpp/aoti_abi_check/CMakeLists.txt
@ -10,8 +10,6 @@ set(AOTI_ABI_CHECK_TEST_SRCS
  ${AOTI_ABI_CHECK_TEST_ROOT}/main.cpp
  ${AOTI_ABI_CHECK_TEST_ROOT}/test_cast.cpp
  ${AOTI_ABI_CHECK_TEST_ROOT}/test_devicetype.cpp
-  ${AOTI_ABI_CHECK_TEST_ROOT}/test_dispatch.cpp
-  ${AOTI_ABI_CHECK_TEST_ROOT}/test_dispatch_v2.cpp
  ${AOTI_ABI_CHECK_TEST_ROOT}/test_dtype.cpp
  ${AOTI_ABI_CHECK_TEST_ROOT}/test_exception.cpp
  ${AOTI_ABI_CHECK_TEST_ROOT}/test_headeronlyarrayref.cpp
--- a/test/cpp/aoti_abi_check/test_dispatch.cpp
+++ b/test/cpp/aoti_abi_check/test_dispatch.cpp
@ -1,82 +0,0 @@
-#include <gtest/gtest.h>
-
-#include <torch/headeronly/core/Dispatch.h>
-#include <torch/headeronly/core/Dispatch_v2.h>
-
-// MY_PRIVATE_CHECK_SELECTIVE_BUILD is a prelude to case block. For
-// testing, we do nothing:
-#define MY_PRIVATE_CHECK_SELECTIVE_BUILD(enum_type) /* empty */
-
-#define MY_PRIVATE_CASE_TYPE_USING_HINT(...) \
-  THO_PRIVATE_CASE_TYPE_USING_HINT_TMPL(     \
-      MY_PRIVATE_CHECK_SELECTIVE_BUILD, __VA_ARGS__)
-
-#define MY_DISPATCH_CASE(...) \
-  THO_DISPATCH_CASE_TMPL(MY_PRIVATE_CASE_TYPE_USING_HINT, __VA_ARGS__)
-
-// MY_RECORD_KERNEL_FUNCTION_DTYPE is a prelude to switch
-// statement. For testing, we just avoid unused variable warning:
-#define MY_RECORD_KERNEL_FUNCTION_DTYPE(DISPATCHNAME, ENUMTYPE) \
-  (void)DISPATCHNAME
-
-// MY_CHECK_NOT_IMPLEMENTED is called in switch default block. For
-// testing, we count case mismatches:
-#define MY_CHECK_NOT_IMPLEMENTED(...) default_count++
-
-#define MY_DISPATCH_SWITCH(...) \
-  THO_DISPATCH_SWITCH_TMPL(     \
-      MY_RECORD_KERNEL_FUNCTION_DTYPE, MY_CHECK_NOT_IMPLEMENTED, __VA_ARGS__)
-
-// MY_CASE_FUNCTION is called in a case block. For testing, we count
-// case matches and ensure that scalar_t/index_t type is defined:
-#define MY_CASE_FUNCTION \
-  [&] {                  \
-    count++;             \
-    scalar_t tmp;        \
-    (void)tmp;           \
-  }
-#define MY_INDEX_CASE_FUNCTION \
-  [&] {                        \
-    count++;                   \
-    index_t tmp;               \
-    (void)tmp;                 \
-  }
-
-#define DEFINE_ITEM(TYPE, SCALARTYPE) ScalarType::SCALARTYPE,
-
-#define MY_DISPATCH_V2(TYPE, NAME, BODY, ...) \
-  THO_DISPATCH_V2_TMPL(                       \
-      MY_DISPATCH_SWITCH,                     \
-      MY_DISPATCH_CASE,                       \
-      TYPE,                                   \
-      NAME,                                   \
-      AT_WRAP(BODY),                          \
-      __VA_ARGS__)
-
-#define TEST_DISPATCH_V2(NAME, EXPECTEDCOUNT, ...)                             \
-  TEST(TestDispatchV2, NAME) {                                                 \
-    using torch::headeronly::ScalarType;                                       \
-    using torch::headeronly::impl::ScalarTypeToCPPTypeT;                       \
-    int8_t total_count = 0;                                                    \
-    int8_t count = 0;                                                          \
-    int8_t default_count = 0;                                                  \
-    for (ScalarType t :                                                        \
-         {AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS(DEFINE_ITEM)}) {       \
-      total_count++;                                                           \
-      MY_DISPATCH_V2(t, "test_my_dispatch_v2", MY_CASE_FUNCTION, __VA_ARGS__); \
-    }                                                                          \
-    EXPECT_EQ(count, EXPECTEDCOUNT);                                           \
-    EXPECT_EQ(default_count + count, total_count);                             \
-  }
-
-TEST_DISPATCH_V2(AT_FLOAT8_TYPES_, 5, AT_FLOAT8_TYPES);
-TEST_DISPATCH_V2(AT_INTEGRAL_TYPES_, 5, AT_INTEGRAL_TYPES);
-TEST_DISPATCH_V2(AT_FLOATING_TYPES_, 2, AT_FLOATING_TYPES);
-TEST_DISPATCH_V2(AT_BAREBONES_UNSIGNED_TYPES_, 3, AT_BAREBONES_UNSIGNED_TYPES);
-TEST_DISPATCH_V2(AT_INTEGRAL_TYPES_V2_, 8, AT_INTEGRAL_TYPES_V2);
-TEST_DISPATCH_V2(AT_COMPLEX_TYPES_, 2, AT_COMPLEX_TYPES);
-TEST_DISPATCH_V2(AT_QINT_TYPES_, 3, AT_QINT_TYPES);
-TEST_DISPATCH_V2(AT_ALL_TYPES_, 7, AT_ALL_TYPES);
-TEST_DISPATCH_V2(AT_ALL_TYPES_AND_COMPLEX_, 9, AT_ALL_TYPES_AND_COMPLEX);
-
-#undef DEFINE_ITEM
--- a/test/cpp/aoti_abi_check/test_dispatch_v2.cpp
+++ b/test/cpp/aoti_abi_check/test_dispatch_v2.cpp
@ -1,45 +0,0 @@
-#include <gtest/gtest.h>
-#include <torch/headeronly/core/Dispatch_v2.h>
-#include <torch/headeronly/util/Exception.h>
-
-#define DEFINE_ITEM(TYPE, SCALARTYPE) ScalarType::SCALARTYPE,
-
-#define TEST_DISPATCH_V2(NAME, EXPECTEDCOUNT, ...)                       \
-  TEST(TestThoDispatchV2, NAME) {                                        \
-    using torch::headeronly::ScalarType;                                 \
-    using torch::headeronly::impl::ScalarTypeToCPPTypeT;                 \
-    int8_t total_count = 0;                                              \
-    int8_t count = 0;                                                    \
-    int8_t default_count = 0;                                            \
-    for (ScalarType t :                                                  \
-         {AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS(DEFINE_ITEM)}) { \
-      total_count++;                                                     \
-      try {                                                              \
-        THO_DISPATCH_V2(                                                 \
-            t,                                                           \
-            "test_tho_dispatch_v2",                                      \
-            [&] {                                                        \
-              count++;                                                   \
-              scalar_t tmp;                                              \
-              (void)tmp;                                                 \
-            },                                                           \
-            __VA_ARGS__);                                                \
-      } catch (...) {                                                    \
-        default_count++; /* counts mismatches */                         \
-      }                                                                  \
-    }                                                                    \
-    EXPECT_EQ(count, EXPECTEDCOUNT);                                     \
-    EXPECT_EQ(default_count + count, total_count);                       \
-  }
-
-TEST_DISPATCH_V2(AT_FLOAT8_TYPES_, 5, AT_FLOAT8_TYPES);
-TEST_DISPATCH_V2(AT_INTEGRAL_TYPES_, 5, AT_INTEGRAL_TYPES);
-TEST_DISPATCH_V2(AT_FLOATING_TYPES_, 2, AT_FLOATING_TYPES);
-TEST_DISPATCH_V2(AT_BAREBONES_UNSIGNED_TYPES_, 3, AT_BAREBONES_UNSIGNED_TYPES);
-TEST_DISPATCH_V2(AT_INTEGRAL_TYPES_V2_, 8, AT_INTEGRAL_TYPES_V2);
-TEST_DISPATCH_V2(AT_COMPLEX_TYPES_, 2, AT_COMPLEX_TYPES);
-TEST_DISPATCH_V2(AT_QINT_TYPES_, 3, AT_QINT_TYPES);
-TEST_DISPATCH_V2(AT_ALL_TYPES_, 7, AT_ALL_TYPES);
-TEST_DISPATCH_V2(AT_ALL_TYPES_AND_COMPLEX_, 9, AT_ALL_TYPES_AND_COMPLEX);
-
-#undef DEFINE_ITEM
--- a/Show More
+++ b/Show More