Update (base update)

[ghstack-poisoned]
2025-11-13 08:04:41 +08:00 · 2025-11-05 17:05:14 +00:00 · 2025-10-28 09:57:49 -07:00
741 changed files with 7271 additions and 18717 deletions
--- a/.ci/docker/almalinux/Dockerfile
+++ b/.ci/docker/almalinux/Dockerfile
@ -7,13 +7,13 @@ ENV LC_ALL en_US.UTF-8
 ENV LANG en_US.UTF-8
 ENV LANGUAGE en_US.UTF-8

-ARG DEVTOOLSET_VERSION=13
+ARG DEVTOOLSET_VERSION=11

 RUN yum -y update
 RUN yum -y install epel-release
 # install glibc-langpack-en make sure en_US.UTF-8 locale is available
 RUN yum -y install glibc-langpack-en
-RUN yum install -y sudo wget curl perl util-linux xz bzip2 git patch which perl zlib-devel openssl-devel yum-utils autoconf automake make gcc-toolset-${DEVTOOLSET_VERSION}-gcc gcc-toolset-${DEVTOOLSET_VERSION}-gcc-c++ gcc-toolset-${DEVTOOLSET_VERSION}-gcc-gfortran gcc-toolset-${DEVTOOLSET_VERSION}-gdb
+RUN yum install -y sudo wget curl perl util-linux xz bzip2 git patch which perl zlib-devel openssl-devel yum-utils autoconf automake make gcc-toolset-${DEVTOOLSET_VERSION}-toolchain
 # Just add everything as a safe.directory for git since these will be used in multiple places with git
 RUN git config --global --add safe.directory '*'
 ENV PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH
@ -41,7 +41,6 @@ RUN bash ./install_conda.sh && rm install_conda.sh
 # Install CUDA
 FROM base as cuda
 ARG CUDA_VERSION=12.6
-ARG DEVTOOLSET_VERSION=13
 RUN rm -rf /usr/local/cuda-*
 ADD ./common/install_cuda.sh install_cuda.sh
 COPY ./common/install_nccl.sh install_nccl.sh
@ -51,8 +50,7 @@ ENV CUDA_HOME=/usr/local/cuda-${CUDA_VERSION}
 # Preserve CUDA_VERSION for the builds
 ENV CUDA_VERSION=${CUDA_VERSION}
 # Make things in our path by default
-ENV PATH=/usr/local/cuda-${CUDA_VERSION}/bin:/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH
-
+ENV PATH=/usr/local/cuda-${CUDA_VERSION}/bin:$PATH

 FROM cuda as cuda12.6
 RUN bash ./install_cuda.sh 12.6
@ -70,22 +68,8 @@ FROM cuda as cuda13.0
 RUN bash ./install_cuda.sh 13.0
 ENV DESIRED_CUDA=13.0

-FROM ${ROCM_IMAGE} as rocm_base
-ARG DEVTOOLSET_VERSION=13
-ENV LC_ALL en_US.UTF-8
-ENV LANG en_US.UTF-8
-ENV LANGUAGE en_US.UTF-8
-# Install devtoolset on ROCm base image
-RUN yum -y update && \
-    yum -y install epel-release && \
-    yum -y install glibc-langpack-en && \
-    yum install -y sudo wget curl perl util-linux xz bzip2 git patch which perl zlib-devel openssl-devel yum-utils autoconf automake make gcc-toolset-${DEVTOOLSET_VERSION}-gcc gcc-toolset-${DEVTOOLSET_VERSION}-gcc-c++ gcc-toolset-${DEVTOOLSET_VERSION}-gcc-gfortran gcc-toolset-${DEVTOOLSET_VERSION}-gdb
-RUN git config --global --add safe.directory '*'
-ENV PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH
-
-FROM rocm_base as rocm
+FROM ${ROCM_IMAGE} as rocm
 ARG PYTORCH_ROCM_ARCH
-ARG DEVTOOLSET_VERSION=13
 ENV PYTORCH_ROCM_ARCH ${PYTORCH_ROCM_ARCH}
 ADD ./common/install_mkl.sh install_mkl.sh
 RUN bash ./install_mkl.sh && rm install_mkl.sh
@ -104,7 +88,6 @@ COPY --from=cuda13.0  /usr/local/cuda-13.0 /usr/local/cuda-13.0

 # Final step
 FROM ${BASE_TARGET} as final
-ARG DEVTOOLSET_VERSION=13
 COPY --from=openssl            /opt/openssl           /opt/openssl
 COPY --from=patchelf           /patchelf              /usr/local/bin/patchelf
 COPY --from=conda              /opt/conda             /opt/conda
--- a/.ci/docker/almalinux/build.sh
+++ b/.ci/docker/almalinux/build.sh
@ -36,7 +36,11 @@ case ${DOCKER_TAG_PREFIX} in
    ;;
  rocm*)
    BASE_TARGET=rocm
-    PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201;gfx950;gfx1150;gfx1151"
+    PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
+    # add gfx950, gfx115x conditionally starting in ROCm 7.0
+    if [[ "$ROCM_VERSION" == *"7.0"* ]]; then
+        PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950;gfx1150;gfx1151"
+    fi
    EXTRA_BUILD_ARGS="${EXTRA_BUILD_ARGS} --build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}"
    ;;
  *)
@ -59,7 +63,7 @@ docker build \
  --target final \
  --progress plain \
  --build-arg "BASE_TARGET=${BASE_TARGET}" \
-  --build-arg "DEVTOOLSET_VERSION=13" \
+  --build-arg "DEVTOOLSET_VERSION=11" \
  ${EXTRA_BUILD_ARGS} \
  -t ${tmp_tag} \
  $@ \
--- a/.ci/docker/build.sh
+++ b/.ci/docker/build.sh
@ -168,18 +168,6 @@ case "$tag" in
    VISION=yes
    TRITON=yes
    ;;
-  pytorch-linux-jammy-py3.11-clang12)
-    ANACONDA_PYTHON_VERSION=3.11
-    CLANG_VERSION=12
-    VISION=no
-    TRITON=no
-    ;;
-  pytorch-linux-jammy-py3.12-clang12)
-    ANACONDA_PYTHON_VERSION=3.12
-    CLANG_VERSION=12
-    VISION=no
-    TRITON=no
-    ;;
  pytorch-linux-jammy-rocm-n-py3 | pytorch-linux-jammy-rocm-n-py3-benchmarks | pytorch-linux-noble-rocm-n-py3)
    if [[ $tag =~ "jammy" ]]; then
      ANACONDA_PYTHON_VERSION=3.10
@ -207,9 +195,9 @@ case "$tag" in
    NINJA_VERSION=1.9.0
    TRITON=yes
    ;;
-  pytorch-linux-noble-xpu-n-py3 | pytorch-linux-noble-xpu-n-py3-inductor-benchmarks)
+  pytorch-linux-jammy-xpu-n-py3 | pytorch-linux-jammy-xpu-n-py3-inductor-benchmarks)
    ANACONDA_PYTHON_VERSION=3.10
-    GCC_VERSION=13
+    GCC_VERSION=11
    VISION=yes
    XPU_VERSION=2025.2
    NINJA_VERSION=1.9.0
@ -260,12 +248,6 @@ case "$tag" in
    HALIDE=yes
    TRITON=yes
    ;;
-  pytorch-linux-jammy-cuda12.8-py3.12-pallas)
-    CUDA_VERSION=12.8.1
-    ANACONDA_PYTHON_VERSION=3.12
-    GCC_VERSION=11
-    PALLAS=yes
-    ;;
  pytorch-linux-jammy-py3.12-triton-cpu)
    CUDA_VERSION=12.6
    ANACONDA_PYTHON_VERSION=3.12
@ -279,9 +261,9 @@ case "$tag" in
    PYTHON_VERSION=3.10
    CUDA_VERSION=12.8.1
    ;;
-  pytorch-linux-jammy-aarch64-py3.10-gcc13)
+  pytorch-linux-jammy-aarch64-py3.10-gcc11)
    ANACONDA_PYTHON_VERSION=3.10
-    GCC_VERSION=13
+    GCC_VERSION=11
    ACL=yes
    VISION=yes
    OPENBLAS=yes
@ -289,19 +271,9 @@ case "$tag" in
    # from pytorch/llvm:9.0.1 is x86 specific
    SKIP_LLVM_SRC_BUILD_INSTALL=yes
    ;;
-  pytorch-linux-jammy-aarch64-py3.10-clang21)
+  pytorch-linux-jammy-aarch64-py3.10-gcc11-inductor-benchmarks)
    ANACONDA_PYTHON_VERSION=3.10
-    CLANG_VERSION=21
-    ACL=yes
-    VISION=yes
-    OPENBLAS=yes
-    # snadampal: skipping llvm src build install because the current version
-    # from pytorch/llvm:9.0.1 is x86 specific
-    SKIP_LLVM_SRC_BUILD_INSTALL=yes
-    ;;
-  pytorch-linux-jammy-aarch64-py3.10-gcc13-inductor-benchmarks)
-    ANACONDA_PYTHON_VERSION=3.10
-    GCC_VERSION=13
+    GCC_VERSION=11
    ACL=yes
    VISION=yes
    OPENBLAS=yes
@ -387,7 +359,6 @@ docker build \
       --build-arg "INDUCTOR_BENCHMARKS=${INDUCTOR_BENCHMARKS}" \
       --build-arg "EXECUTORCH=${EXECUTORCH}" \
       --build-arg "HALIDE=${HALIDE}" \
-       --build-arg "PALLAS=${PALLAS}" \
       --build-arg "XPU_VERSION=${XPU_VERSION}" \
       --build-arg "UNINSTALL_DILL=${UNINSTALL_DILL}" \
       --build-arg "ACL=${ACL:-}" \
--- a/.ci/docker/ci_commit_pins/jax.txt
+++ b/.ci/docker/ci_commit_pins/jax.txt
@ -1 +0,0 @@
-0.8.0
--- a/.ci/docker/common/install_clang.sh
+++ b/.ci/docker/common/install_clang.sh
@ -8,8 +8,8 @@ if [ -n "$CLANG_VERSION" ]; then
    # work around ubuntu apt-get conflicts
    sudo apt-get -y -f install
    wget --no-check-certificate -O - https://apt.llvm.org/llvm-snapshot.gpg.key | sudo apt-key add -
-    if [[ $CLANG_VERSION -ge 18 ]]; then
-      apt-add-repository "deb http://apt.llvm.org/jammy/ llvm-toolchain-jammy-${CLANG_VERSION} main"
+    if [[ $CLANG_VERSION == 18 ]]; then
+      apt-add-repository "deb http://apt.llvm.org/jammy/ llvm-toolchain-jammy-18 main"
    fi
  fi

--- a/.ci/docker/common/install_cuda.sh
+++ b/.ci/docker/common/install_cuda.sh
@ -129,7 +129,7 @@ function install_129 {
 }

 function install_128 {
-  CUDNN_VERSION=9.8.0.87
+  CUDNN_VERSION=9.10.2.21
  echo "Installing CUDA 12.8.1 and cuDNN ${CUDNN_VERSION} and NVSHMEM and NCCL and cuSparseLt-0.7.1"
  # install CUDA 12.8.1 in the same container
  install_cuda 12.8.1 cuda_12.8.1_570.124.06_linux
--- a/.ci/docker/common/install_gcc.sh
+++ b/.ci/docker/common/install_gcc.sh
@ -7,11 +7,11 @@ if [ -n "$GCC_VERSION" ]; then
  # Need the official toolchain repo to get alternate packages
  add-apt-repository ppa:ubuntu-toolchain-r/test
  apt-get update
-  apt-get install -y g++-$GCC_VERSION gfortran-$GCC_VERSION
+  apt-get install -y g++-$GCC_VERSION
  update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-"$GCC_VERSION" 50
  update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-"$GCC_VERSION" 50
  update-alternatives --install /usr/bin/gcov gcov /usr/bin/gcov-"$GCC_VERSION" 50
-  update-alternatives --install /usr/bin/gfortran gfortran /usr/bin/gfortran-"$GCC_VERSION" 50
+

  # Cleanup package manager
  apt-get autoclean && apt-get clean
--- a/.ci/docker/common/install_jax.sh
+++ b/.ci/docker/common/install_jax.sh
@ -1,40 +0,0 @@
-#!/bin/bash
-
-set -ex
-
-source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh"
-
-# Get the pinned JAX version (same for all CUDA versions)
-JAX_VERSION=$(get_pinned_commit /ci_commit_pins/jax)
-
-function install_jax_12() {
-  echo "Installing JAX ${JAX_VERSION} with CUDA 12 support"
-  pip_install "jax[cuda12]==${JAX_VERSION}" -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html
-
-  # Verify installation
-  python -c "import jax"  # check for errors
-  echo "JAX ${JAX_VERSION} installation completed successfully for CUDA 12"
-}
-
-function install_jax_13() {
-  echo "Installing JAX ${JAX_VERSION} with CUDA 13 support"
-  pip_install "jax[cuda13]==${JAX_VERSION}" -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html
-
-  # Verify installation
-  python -c "import jax"  # check for errors
-  echo "JAX ${JAX_VERSION} installation completed successfully for CUDA 13"
-}
-
-# idiomatic parameter and option handling in sh
-while test $# -gt 0
-do
-    case "$1" in
-    12.4|12.6|12.6.*|12.8|12.8.*|12.9|12.9.*) install_jax_12;
-        ;;
-    13.0|13.0.*) install_jax_13;
-        ;;
-    *) echo "bad argument $1"; exit 1
-        ;;
-    esac
-    shift
-done
--- a/.ci/docker/common/install_libgomp.sh
+++ b/.ci/docker/common/install_libgomp.sh
@ -1,56 +0,0 @@
-#!/bin/bash
-# Script used only in CD pipeline
-
-set -ex
-
-# install dependencies
-dnf -y install gmp-devel libmpc-devel texinfo flex bison
-
-cd /usr/local/src
-# fetch source for gcc 13
-git clone --depth 1 --single-branch -b releases/gcc-13.3.0 https://github.com/gcc-mirror/gcc.git gcc-13.3.0
-
-mkdir -p gcc-13.3.0/build-gomp
-cd gcc-13.3.0/build-gomp
-
-# configure gcc build
-# I got these flags by:
-# 1. downloading the source rpm for gcc-11 on AlmaLinux 8 container
-#    dnf install -y dnf-plugins-core rpmdevtools
-#   dnf download --source libgomp
-# 2. extracting the gcc.spec from the source.
-#    rpmdev-extract gcc-xx.src.rpm
-# 3. extracting optflags and ld_flags from gcc.spec:
-#    rpm --eval '%{optflags}'
-#    rpm --eval '%{build_ldflags}'
-#
-# I had to remove the following flags because they didn't compile for this version of libgomp:
-#   -Werror=format-security
-#   -specs=/usr/lib/rpm/redhat/redhat-hardened-cc1
-#   -specs=/usr/lib/rpm/redhat/redhat-annobin-cc1
-#
-# I added -march=armv8-a -mtune=generic to make them explicit. I don't think they're strictly needed.
-
-OPT_FLAGS='-O2 -march=armv8-a -mtune=generic'\
-' -fexceptions -g -grecord-gcc-switches -pipe -Wall'\
-' -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS'\
-' -fstack-protector-strong -fasynchronous-unwind-tables'\
-' -fstack-clash-protection'
-
-LDFLAGS='-Wl,-z,relro -Wl,--as-needed -Wl,-z,now'
-
-CFLAGS="$OPT_FLAGS" \
-CXXFLAGS="$OPT_FLAGS" \
-LDFLAGS="$LDFLAGS" \
-../configure \
-  --prefix=/usr \
-  --libdir=/usr/lib64 \
-  --enable-languages=c,c++ \
-  --disable-multilib \
-  --disable-bootstrap \
-  --enable-libgomp
-
-# only build libgomp
-make -j$(nproc) all-target-libgomp
-
-make install-target-libgomp
--- a/.ci/docker/common/install_openblas.sh
+++ b/.ci/docker/common/install_openblas.sh
@ -10,7 +10,6 @@ git clone https://github.com/OpenMathLib/OpenBLAS.git -b "${OPENBLAS_VERSION}" -

 OPENBLAS_CHECKOUT_DIR="OpenBLAS"
 OPENBLAS_BUILD_FLAGS="
-CC=gcc
 NUM_THREADS=128
 USE_OPENMP=1
 NO_SHARED=0
--- a/.ci/docker/common/install_xpu.sh
+++ b/.ci/docker/common/install_xpu.sh
@ -9,7 +9,7 @@ set -xe

 function install_ubuntu() {
    . /etc/os-release
-    if [[ ! " jammy noble " =~ " ${VERSION_CODENAME} " ]]; then
+    if [[ ! " jammy " =~ " ${VERSION_CODENAME} " ]]; then
        echo "Ubuntu version ${VERSION_CODENAME} not supported"
        exit
    fi
@ -35,24 +35,25 @@ function install_ubuntu() {
    # The xpu-smi packages
    apt-get install -y flex bison xpu-smi

-    # Compute and Media Runtimes
-    if [[ " ${VERSION_CODENAME} " =~ " noble " ]]; then
+    if [[ "${XPU_DRIVER_TYPE,,}" == "lts" ]]; then
+        # Compute and Media Runtimes
        apt-get install -y \
-            intel-opencl-icd libze-intel-gpu1 libze1 \
-            intel-media-va-driver-non-free libmfx-gen1 libvpl2 \
-            libegl-mesa0 libegl1-mesa-dev libgbm1 libgl1-mesa-dev libgl1-mesa-dri \
+            intel-opencl-icd intel-level-zero-gpu level-zero \
+            intel-media-va-driver-non-free libmfx1 libmfxgen1 libvpl2 \
+            libegl-mesa0 libegl1-mesa libegl1-mesa-dev libgbm1 libgl1-mesa-dev libgl1-mesa-dri \
            libglapi-mesa libgles2-mesa-dev libglx-mesa0 libigdgmm12 libxatracker2 mesa-va-drivers \
-            mesa-vdpau-drivers mesa-vulkan-drivers va-driver-all vainfo hwinfo clinfo intel-ocloc
-    else # jammy
+            mesa-vdpau-drivers mesa-vulkan-drivers va-driver-all vainfo hwinfo clinfo
+        # Development Packages
+        apt-get install -y libigc-dev intel-igc-cm libigdfcl-dev libigfxcmrt-dev level-zero-dev
+    else # rolling driver
        apt-get install -y \
            intel-opencl-icd libze-intel-gpu1 libze1 \
            intel-media-va-driver-non-free libmfx-gen1 libvpl2 \
            libegl-mesa0 libegl1-mesa libegl1-mesa-dev libgbm1 libgl1-mesa-dev libgl1-mesa-dri \
            libglapi-mesa libglx-mesa0 libigdgmm12 libxatracker2 mesa-va-drivers \
            mesa-vdpau-drivers mesa-vulkan-drivers va-driver-all vainfo hwinfo clinfo intel-ocloc
+        apt-get install -y libigc-dev intel-igc-cm libigdfcl-dev libigfxcmrt-dev libze-dev
    fi
-    # Development Packages
-    apt-get install -y libigc-dev intel-igc-cm libigdfcl-dev libigfxcmrt-dev libze-dev

    # Install Intel Support Packages
    apt-get install -y ${XPU_PACKAGES}
@ -65,7 +66,7 @@ function install_ubuntu() {
 function install_rhel() {
    . /etc/os-release
    if [[ "${ID}" == "rhel" ]]; then
-        if [[ ! " 8.8 8.10 9.0 9.2 9.3 " =~ " ${VERSION_ID} " ]]; then
+        if [[ ! " 8.8 8.9 9.0 9.2 9.3 " =~ " ${VERSION_ID} " ]]; then
            echo "RHEL version ${VERSION_ID} not supported"
            exit
        fi
@ -146,7 +147,7 @@ function install_sles() {
 XPU_DRIVER_VERSION=""
 if [[ "${XPU_DRIVER_TYPE,,}" == "lts" ]]; then
    # Use GPU driver LTS releases
-    XPU_DRIVER_VERSION="/lts/2523"
+    XPU_DRIVER_VERSION="/lts/2350"
 fi

 # Default use Intel® oneAPI Deep Learning Essentials 2025.1
--- a/.ci/docker/libtorch/build.sh
+++ b/.ci/docker/libtorch/build.sh
@ -49,7 +49,11 @@ case ${DOCKER_TAG_PREFIX} in
        fi
        BASE_TARGET=rocm
        GPU_IMAGE=rocm/dev-ubuntu-22.04:${GPU_ARCH_VERSION}-complete
-        PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201;gfx950;gfx1150;gfx1151"
+        PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
+        # add gfx950, gfx115x conditionally starting in ROCm 7.0
+        if [[ "$GPU_ARCH_VERSION" == *"7.0"* ]]; then
+            PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950;gfx1150;gfx1151"
+        fi
        DOCKER_GPU_BUILD_ARG="--build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg ROCM_VERSION=${GPU_ARCH_VERSION}"
        ;;
    *)
--- a/.ci/docker/manywheel/Dockerfile_2_28_aarch64
+++ b/.ci/docker/manywheel/Dockerfile_2_28_aarch64
@ -50,10 +50,6 @@ RUN rm install_ninja.sh
 ENV PATH=/opt/rh/gcc-toolset-${GCCTOOLSET_VERSION}/root/usr/bin:$PATH
 ENV LD_LIBRARY_PATH=/opt/rh/gcc-toolset-${GCCTOOLSET_VERSION}/root/usr/lib64:/opt/rh/gcc-toolset-${GCCTOOLSET_VERSION}/root/usr/lib:$LD_LIBRARY_PATH

-# Build a newer version of libgomp than that supported in in Almalinux 8.
-COPY ./common/install_libgomp.sh install_libgomp.sh
-RUN bash ./install_libgomp.sh && rm install_libgomp.sh
-
 # git236+ would refuse to run git commands in repos owned by other users
 # Which causes version check to fail, as pytorch repo is bind-mounted into the image
 # Override this behaviour by treating every folder as safe
--- a/.ci/docker/manywheel/build.sh
+++ b/.ci/docker/manywheel/build.sh
@ -87,7 +87,11 @@ case ${image} in
        MANY_LINUX_VERSION="2_28"
        DEVTOOLSET_VERSION="11"
        GPU_IMAGE=rocm/dev-almalinux-8:${GPU_ARCH_VERSION}-complete
-        PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201;gfx950;gfx1150;gfx1151"
+        PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
+        # add gfx950, gfx115x conditionally starting in ROCm 7.0
+        if [[ "$GPU_ARCH_VERSION" == *"7.0"* ]]; then
+            PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950;gfx1150;gfx1151"
+        fi
        DOCKER_GPU_BUILD_ARG="--build-arg ROCM_VERSION=${GPU_ARCH_VERSION} --build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg DEVTOOLSET_VERSION=${DEVTOOLSET_VERSION}"
        ;;
    manylinux2_28-builder:xpu)
--- a/.ci/docker/requirements-docs.txt
+++ b/.ci/docker/requirements-docs.txt
@ -1,11 +1,15 @@
-sphinx==7.2.6
+sphinx==5.3.0
 #Description: This is used to generate PyTorch docs
-#Pinned versions: 7.2.6
+#Pinned versions: 5.3.0

-pytorch_sphinx_theme2==0.2.0
-#Description: This is needed to generate PyTorch docs
-#Pinned versions: 0.2.0
+standard-imghdr==3.13.0; python_version >= "3.13"
+#Description: This is needed by Sphinx, so it needs to be added here.
+# The reasons are as follows:
+# 1) This module has been removed from the Python standard library since Python 3.13(https://peps.python.org/pep-0594/#imghdr);
+# 2) The current version of Sphinx (5.3.0) is not compatible with Python 3.13.
+# Once Sphinx is upgraded to a version compatible with Python 3.13 or later, we can remove this dependency.

+-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@71e55749be14ceb56e7f8211a9fb649866b87ad4#egg=pytorch_sphinx_theme2
 # TODO: sphinxcontrib.katex 0.9.0 adds a local KaTeX server to speed up pre-rendering
 # but it doesn't seem to work and hangs around idly. The initial thought that it is probably
 # something related to Docker setup. We can investigate this later.
@ -32,17 +36,17 @@ tensorboard==2.18.0 ; python_version >= "3.13"
 #Description: This is used to generate PyTorch docs
 #Pinned versions: 2.13.0

-breathe==4.36.0
+breathe==4.34.0
 #Description: This is used to generate PyTorch C++ docs
-#Pinned versions: 4.36.0
+#Pinned versions: 4.34.0

-exhale==0.3.7
+exhale==0.2.3
 #Description: This is used to generate PyTorch C++ docs
-#Pinned versions: 0.3.7
+#Pinned versions: 0.2.3

-docutils==0.20
+docutils==0.16
 #Description: This is used to generate PyTorch C++ docs
-#Pinned versions: 0.20
+#Pinned versions: 0.16

 bs4==0.0.1
 #Description: This is used to generate PyTorch C++ docs
@ -52,13 +56,13 @@ IPython==8.12.0
 #Description: This is used to generate PyTorch functorch docs
 #Pinned versions: 8.12.0

-myst-nb==1.3.0
+myst-nb==0.17.2
 #Description: This is used to generate PyTorch functorch and torch.compile docs.
-#Pinned versions: 1.3.0
+#Pinned versions: 0.17.2

 # The following are required to build torch.distributed.elastic.rendezvous.etcd* docs
 python-etcd==0.4.5
 sphinx-copybutton==0.5.0
-sphinx-design==0.6.1
+sphinx-design==0.4.0
 sphinxcontrib-mermaid==1.0.0
-myst-parser==4.0.1
+myst-parser==0.18.1
--- a/.ci/docker/ubuntu/Dockerfile
+++ b/.ci/docker/ubuntu/Dockerfile
@ -143,15 +143,6 @@ COPY ci_commit_pins/halide.txt halide.txt
 RUN if [ -n "${HALIDE}" ]; then bash ./install_halide.sh; fi
 RUN rm install_halide.sh common_utils.sh halide.txt

-ARG PALLAS
-ARG CUDA_VERSION
-# Install JAX with CUDA support (for Pallas)
-COPY ./common/install_jax.sh install_jax.sh
-COPY ./common/common_utils.sh common_utils.sh
-COPY ./ci_commit_pins/jax.txt /ci_commit_pins/jax.txt
-RUN if [ -n "${PALLAS}" ]; then bash ./install_jax.sh ${CUDA_VERSION}; fi
-RUN rm -f install_jax.sh common_utils.sh /ci_commit_pins/jax.txt
-
 ARG ONNX
 # Install ONNX dependencies
 COPY ./common/install_onnx.sh ./common/common_utils.sh ./
--- a/.ci/lumen_cli/cli/lib/common/cli_helper.py
+++ b/.ci/lumen_cli/cli/lib/common/cli_helper.py
@ -8,11 +8,9 @@ from abc import ABC, abstractmethod


 try:
-    from collections.abc import Callable  # Python 3.11+
-    from typing import Any, Required, TypedDict
+    from typing import Any, Callable, Required, TypedDict  # Python 3.11+
 except ImportError:
-    from collections.abc import Callable
-    from typing import Any, TypedDict
+    from typing import Any, Callable, TypedDict

    from typing_extensions import Required  # Fallback for Python <3.11

--- a/.ci/pytorch/build.sh
+++ b/.ci/pytorch/build.sh
@ -168,16 +168,14 @@ if [[ "$BUILD_ENVIRONMENT" == *xpu* ]]; then
  # shellcheck disable=SC1091
  source /opt/intel/oneapi/compiler/latest/env/vars.sh
  # shellcheck disable=SC1091
-  source /opt/intel/oneapi/umf/latest/env/vars.sh
-  # shellcheck disable=SC1091
  source /opt/intel/oneapi/ccl/latest/env/vars.sh
  # shellcheck disable=SC1091
  source /opt/intel/oneapi/mpi/latest/env/vars.sh
-  # shellcheck disable=SC1091
-  source /opt/intel/oneapi/pti/latest/env/vars.sh
  # Enable XCCL build
  export USE_XCCL=1
  export USE_MPI=0
+  # XPU kineto feature dependencies are not fully ready, disable kineto build as temp WA
+  export USE_KINETO=0
  export TORCH_XPU_ARCH_LIST=pvc
 fi

--- a/.ci/pytorch/python_doc_push_script.sh
+++ b/.ci/pytorch/python_doc_push_script.sh
@ -89,41 +89,23 @@ if [ "$is_main_doc" = true ]; then

  make coverage
  # Now we have the coverage report, we need to make sure it is empty.
-  # Sphinx 7.2.6+ format: python.txt contains a statistics table with a TOTAL row
-  # showing the undocumented count in the third column.
-  # Example: | TOTAL | 99.83% | 2 |
+  # Count the number of lines in the file and turn that number into a variable
+  # $lines. The `cut -f1 ...` is to only parse the number, not the filename
+  # Skip the report header by subtracting 2: the header will be output even if
+  # there are no undocumented items.
  #
  # Also: see docs/source/conf.py for "coverage_ignore*" items, which should
  # be documented then removed from there.
-
-  # Extract undocumented count from TOTAL row in Sphinx 7.2.6 statistics table
-  # The table format is: | Module | Coverage | Undocumented |
-  # Extract the third column (undocumented count) from the TOTAL row
-  undocumented=$(grep "| TOTAL" build/coverage/python.txt | awk -F'|' '{print $4}' | tr -d ' ')
-
-  if [ -z "$undocumented" ] || ! [[ "$undocumented" =~ ^[0-9]+$ ]]; then
+  lines=$(wc -l build/coverage/python.txt 2>/dev/null |cut -f1 -d' ')
+  undocumented=$((lines - 2))
+  if [ $undocumented -lt 0 ]; then
    echo coverage output not found
    exit 1
-  elif [ "$undocumented" -gt 0 ]; then
-    set +x  # Disable command echoing for cleaner output
-    echo ""
-    echo "====================="
-    echo "UNDOCUMENTED OBJECTS:"
-    echo "====================="
-    echo ""
-    # Find the line number of the TOTAL row and print only what comes after it
-    total_line=$(grep -n "| TOTAL" build/coverage/python.txt | cut -d: -f1)
-    if [ -n "$total_line" ]; then
-      # Print only the detailed list (skip the statistics table)
-      tail -n +$((total_line + 2)) build/coverage/python.txt
-    else
-      # Fallback to showing entire file if TOTAL line not found
-      cat build/coverage/python.txt
-    fi
-    echo ""
+  elif [ $undocumented -gt 0 ]; then
+    echo undocumented objects found:
+    cat build/coverage/python.txt
    echo "Make sure you've updated relevant .rsts in docs/source!"
-    echo "You can reproduce locally by running 'cd docs && make coverage && tail -n +\$((grep -n \"| TOTAL\" build/coverage/python.txt | cut -d: -f1) + 2)) build/coverage/python.txt'"
-    set -x  # Re-enable command echoing
+    echo "You can reproduce locally by running 'cd docs && make coverage && cat build/coverage/python.txt'"
    exit 1
  fi
 else
--- a/.ci/pytorch/smoke_test/smoke_test.py
+++ b/.ci/pytorch/smoke_test/smoke_test.py
@ -272,6 +272,18 @@ def smoke_test_cuda(
        torch_cudnn_version = cudnn_to_version_str(torch.backends.cudnn.version())
        print(f"Torch cuDNN version: {torch_cudnn_version}")

+        torch_cudnn_compile_version = torch._C._cudnn.getCompileVersion()
+        print(f"Torch cuDNN compile-time version: {torch_cudnn_compile_version}")
+        torch_cudnn_runtime_version = tuple(
+            [int(x) for x in torch_cudnn_version.split(".")]
+        )
+        if torch_cudnn_runtime_version != torch_cudnn_compile_version:
+            raise RuntimeError(
+                "cuDNN runtime version doesn't match comple version. "
+                f"Loaded: {torch_cudnn_runtime_version} "
+                f"Expected: {torch_cudnn_compile_version}"
+            )
+
        if sys.platform in ["linux", "linux2"]:
            torch_nccl_version = ".".join(str(v) for v in torch.cuda.nccl.version())
            print(f"Torch nccl; version: {torch_nccl_version}")
--- a/.ci/pytorch/test.sh
+++ b/.ci/pytorch/test.sh
@ -208,8 +208,6 @@ if [[ "$BUILD_ENVIRONMENT" == *xpu* ]]; then
  source /opt/intel/oneapi/ccl/latest/env/vars.sh
  # shellcheck disable=SC1091
  source /opt/intel/oneapi/mpi/latest/env/vars.sh
-  # shellcheck disable=SC1091
-  source /opt/intel/oneapi/pti/latest/env/vars.sh
  # Check XPU status before testing
  timeout 30 xpu-smi discovery || true
 fi
@ -339,7 +337,7 @@ test_python() {

 test_python_smoke() {
  # Smoke tests for H100/B200
-  time python test/run_test.py --include test_matmul_cuda test_scaled_matmul_cuda inductor/test_fp8 inductor/test_max_autotune inductor/test_cutedsl_grouped_mm $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
+  time python test/run_test.py --include test_matmul_cuda test_scaled_matmul_cuda inductor/test_fp8 inductor/test_max_autotune $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
  assert_git_not_dirty
 }

@ -826,11 +824,6 @@ test_inductor_halide() {
  assert_git_not_dirty
 }

-test_inductor_pallas() {
-  python test/run_test.py --include inductor/test_pallas.py --verbose
-  assert_git_not_dirty
-}
-
 test_inductor_triton_cpu() {
  python test/run_test.py --include inductor/test_triton_cpu_backend.py inductor/test_torchinductor_strided_blocks.py --verbose
  assert_git_not_dirty
@ -1731,8 +1724,6 @@ elif [[ "${TEST_CONFIG}" == *inductor_distributed* ]]; then
  test_inductor_distributed
 elif [[ "${TEST_CONFIG}" == *inductor-halide* ]]; then
  test_inductor_halide
-elif [[ "${TEST_CONFIG}" == *inductor-pallas* ]]; then
-  test_inductor_pallas
 elif [[ "${TEST_CONFIG}" == *inductor-triton-cpu* ]]; then
  test_inductor_triton_cpu
 elif [[ "${TEST_CONFIG}" == *inductor-micro-benchmark* ]]; then
--- a/.ci/pytorch/win-test-helpers/arm64/build_pytorch.ps1
+++ b/.ci/pytorch/win-test-helpers/arm64/build_pytorch.ps1
@ -70,7 +70,7 @@ sccache --zero-stats
 sccache --show-stats

 # Build the wheel
-python -m build --wheel --no-isolation
+python -m build --wheel --no-build-isolation
 if ($LASTEXITCODE -ne 0) { exit 1 }

 # Install the wheel locally
--- a/.github/ISSUE_TEMPLATE/release-feature-request.yml
+++ b/.github/ISSUE_TEMPLATE/release-feature-request.yml
@ -1,11 +1,11 @@
-name: 🚀 New Feature for Release
+name: 🚀 Release highlight for proposed Feature
 description: Submit a Release highlight for proposed Feature
 labels: ["release-feature-request"]

 body:
 - type: textarea
  attributes:
-    label: New Feature for Release
+    label: Release highlight for proposed Feature
    description: >
      Example: “A torch.special module, analogous to SciPy's special module.”
 - type: input
--- a/.github/ci_commit_pins/audio.txt
+++ b/.github/ci_commit_pins/audio.txt
@ -1 +1 @@
-ad5816f0eee1c873df1b7d371c69f1f811a89387
+3b0e7a6f192ca2715e7e6cbe5db007aea7165fe2
--- a/.github/ci_commit_pins/vision.txt
+++ b/.github/ci_commit_pins/vision.txt
@ -1 +1 @@
-ccb801b88af136454798b945175c4c87e636ac33
+cfbc5c2f1c798991715a6b06bb3ce46478c4487c
--- a/.github/ci_commit_pins/xla.txt
+++ b/.github/ci_commit_pins/xla.txt
@ -1 +1 @@
-e4d25697f9dc5eedaf8f0a5bf085c62c5455a53a
+c8b09f5f77d6bf6fb7ed7a9aa83e5d8156b3a5e9
--- a/.github/labeler.yml
+++ b/.github/labeler.yml
@ -138,8 +138,7 @@
 - test/test_matmul_cuda.py
 - test/test_scaled_matmul_cuda.py
 - test/inductor/test_fp8.py
- aten/src/ATen/native/cuda/*Blas.cpp
- aten/src/ATen/cuda/CUDA*Blas.*
+- aten/src/ATen/native/cuda/Blas.cpp
 - torch/**/*cublas*
 - torch/_inductor/kernel/mm.py
 - test/inductor/test_max_autotune.py
@ -149,8 +148,7 @@
 - test/test_matmul_cuda.py
 - test/test_scaled_matmul_cuda.py
 - test/inductor/test_fp8.py
- aten/src/ATen/native/cuda/*Blas.cpp
- aten/src/ATen/cuda/CUDA*Blas.*
+- aten/src/ATen/native/cuda/Blas.cpp
 - torch/**/*cublas*
 - torch/_inductor/kernel/mm.py
 - test/inductor/test_max_autotune.py
@ -160,8 +158,7 @@
 - test/test_matmul_cuda.py
 - test/test_scaled_matmul_cuda.py
 - test/inductor/test_fp8.py
- aten/src/ATen/native/cuda/*Blas.cpp
- aten/src/ATen/cuda/CUDA*Blas.*
+- aten/src/ATen/native/cuda/Blas.cpp
 - torch/_inductor/kernel/mm.py
 - test/inductor/test_max_autotune.py
 - third_party/fbgemm
--- a/.github/nitpicks.yml
+++ b/.github/nitpicks.yml
@ -10,4 +10,3 @@
  pathFilter:
    - 'torch/csrc/inductor/aoti_torch/c/*'
    - 'torch/csrc/inductor/aoti_torch/generated/*'
-    - 'torch/csrc/stable/c/*'
--- a/.github/pytorch-probot.yml
+++ b/.github/pytorch-probot.yml
@ -2,8 +2,8 @@ tracking_issue: 24422
 ciflow_tracking_issue: 64124
 ciflow_push_tags:
 - ciflow/b200
- ciflow/b200-distributed
 - ciflow/b200-symm-mem
+- ciflow/b200-distributed
 - ciflow/binaries
 - ciflow/binaries_libtorch
 - ciflow/binaries_wheel
@ -22,8 +22,6 @@ ciflow_push_tags:
 - ciflow/inductor-perf-test-nightly-xpu
 - ciflow/inductor-periodic
 - ciflow/inductor-rocm
- ciflow/inductor-rocm-mi200
- ciflow/inductor-rocm-mi300
 - ciflow/linux-aarch64
 - ciflow/mps
 - ciflow/nightly
@ -35,13 +33,11 @@ ciflow_push_tags:
 - ciflow/quantization-periodic
 - ciflow/riscv64
 - ciflow/rocm
- ciflow/rocm-mi200
 - ciflow/rocm-mi300
 - ciflow/rocm-mi355
 - ciflow/rocm-navi31
 - ciflow/s390
 - ciflow/slow
- ciflow/slow-rocm-mi200
 - ciflow/torchbench
 - ciflow/triton_binaries
 - ciflow/trunk
--- a/.github/scripts/delete_old_branches.py
+++ b/.github/scripts/delete_old_branches.py
@ -1,11 +1,10 @@
 # Delete old branches
 import os
 import re
-from collections.abc import Callable
 from datetime import datetime
 from functools import lru_cache
 from pathlib import Path
-from typing import Any
+from typing import Any, Callable

 from github_utils import gh_fetch_json_dict, gh_graphql
 from gitutils import GitRepo
--- a/.github/scripts/filter_test_configs.py
+++ b/.github/scripts/filter_test_configs.py
@ -8,11 +8,10 @@ import re
 import subprocess
 import sys
 import warnings
-from collections.abc import Callable
 from enum import Enum
 from functools import cache
 from logging import info
-from typing import Any, Optional
+from typing import Any, Callable, Optional
 from urllib.request import Request, urlopen

 import yaml
--- a/.github/scripts/get_workflow_job_id.py
+++ b/.github/scripts/get_workflow_job_id.py
@ -11,8 +11,7 @@ import sys
 import time
 import urllib
 import urllib.parse
-from collections.abc import Callable
-from typing import Any, Optional
+from typing import Any, Callable, Optional
 from urllib.request import Request, urlopen


--- a/.github/scripts/github_utils.py
+++ b/.github/scripts/github_utils.py
@ -3,9 +3,8 @@
 import json
 import os
 import warnings
-from collections.abc import Callable
 from dataclasses import dataclass
-from typing import Any, cast, Optional, Union
+from typing import Any, Callable, cast, Optional, Union
 from urllib.error import HTTPError
 from urllib.parse import quote
 from urllib.request import Request, urlopen
--- a/.github/scripts/gitutils.py
+++ b/.github/scripts/gitutils.py
@ -4,10 +4,10 @@ import os
 import re
 import tempfile
 from collections import defaultdict
-from collections.abc import Callable, Iterator
+from collections.abc import Iterator
 from datetime import datetime
 from functools import wraps
-from typing import Any, cast, Optional, TypeVar, Union
+from typing import Any, Callable, cast, Optional, TypeVar, Union


 T = TypeVar("T")
--- a/.github/scripts/trymerge.py
+++ b/.github/scripts/trymerge.py
@ -17,12 +17,12 @@ import re
 import time
 import urllib.parse
 from collections import defaultdict
-from collections.abc import Callable, Iterable
+from collections.abc import Iterable
 from dataclasses import dataclass
 from functools import cache
 from pathlib import Path
 from re import Pattern
-from typing import Any, cast, NamedTuple, Optional
+from typing import Any, Callable, cast, NamedTuple, Optional
 from warnings import warn

 import yaml
--- a/.github/workflows/docker-builds.yml
+++ b/.github/workflows/docker-builds.yml
@ -56,8 +56,6 @@ jobs:
          pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9,
          pytorch-linux-jammy-cuda12.4-cudnn9-py3-gcc11,
          pytorch-linux-jammy-py3.10-clang12,
-          pytorch-linux-jammy-py3.11-clang12,
-          pytorch-linux-jammy-py3.12-clang12,
          pytorch-linux-jammy-py3.13-clang12,
          pytorch-linux-jammy-py3.14-clang12,
          pytorch-linux-jammy-rocm-n-py3,
@ -67,10 +65,9 @@ jobs:
          pytorch-linux-jammy-py3.10-gcc11,
          pytorch-linux-jammy-py3-gcc11-inductor-benchmarks,
          pytorch-linux-jammy-py3.12-halide,
-          pytorch-linux-jammy-cuda12.8-py3.12-pallas,
          pytorch-linux-jammy-xpu-n-1-py3,
-          pytorch-linux-noble-xpu-n-py3,
-          pytorch-linux-noble-xpu-n-py3-inductor-benchmarks,
+          pytorch-linux-jammy-xpu-n-py3,
+          pytorch-linux-jammy-xpu-n-py3-inductor-benchmarks,
          pytorch-linux-jammy-py3-clang18-asan,
          pytorch-linux-jammy-py3-clang12-onnx,
          pytorch-linux-jammy-linter,
@ -80,11 +77,9 @@ jobs:
          pytorch-linux-noble-riscv64-py3.12-gcc14
        ]
        include:
-          - docker-image-name: pytorch-linux-jammy-aarch64-py3.10-gcc13
+          - docker-image-name: pytorch-linux-jammy-aarch64-py3.10-gcc11
            runner: linux.arm64.m7g.4xlarge
-          - docker-image-name: pytorch-linux-jammy-aarch64-py3.10-clang21
-            runner: linux.arm64.m7g.4xlarge
-          - docker-image-name: pytorch-linux-jammy-aarch64-py3.10-gcc13-inductor-benchmarks
+          - docker-image-name: pytorch-linux-jammy-aarch64-py3.10-gcc11-inductor-benchmarks
            runner: linux.arm64.m7g.4xlarge
            timeout-minutes: 600
    # Docker uploads fail from LF runners, see https://github.com/pytorch/pytorch/pull/137358
--- a/.github/workflows/inductor-perf-test-nightly-aarch64.yml
+++ b/.github/workflows/inductor-perf-test-nightly-aarch64.yml
@ -72,7 +72,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      runner: linux.arm64.m7g.4xlarge
      build-environment: linux-jammy-aarch64-py3.10
-      docker-image-name: ci-image:pytorch-linux-jammy-aarch64-py3.10-gcc13-inductor-benchmarks
+      docker-image-name: ci-image:pytorch-linux-jammy-aarch64-py3.10-gcc11-inductor-benchmarks
      test-matrix: |
        { include: [
          { config: "inductor_huggingface_perf_cpu_aarch64", shard: 1, num_shards: 9, runner: "linux.arm64.m7g.metal" },
--- a/.github/workflows/inductor-perf-test-nightly-xpu.yml
+++ b/.github/workflows/inductor-perf-test-nightly-xpu.yml
@ -83,8 +83,8 @@ jobs:
    needs: get-label-type
    with:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      build-environment: linux-noble-xpu-n-py3.10
-      docker-image-name: ci-image:pytorch-linux-noble-xpu-n-py3-inductor-benchmarks
+      build-environment: linux-jammy-xpu-n-py3.10
+      docker-image-name: ci-image:pytorch-linux-jammy-xpu-n-py3-inductor-benchmarks
      runner: linux.c7i.12xlarge
      test-matrix: |
        { include: [
@ -117,7 +117,7 @@ jobs:
    uses: ./.github/workflows/_xpu-test.yml
    needs: xpu-n-py3_10-inductor-benchmark-build
    with:
-      build-environment: linux-noble-xpu-n-py3.10
+      build-environment: linux-jammy-xpu-n-py3.10
      dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-false-cppwrapper-true-aotinductor-true-freezing_cudagraphs-false-cudagraphs_low_precision-false
      docker-image: ${{ needs.xpu-n-py3_10-inductor-benchmark-build.outputs.docker-image }}
      test-matrix: ${{ needs.xpu-n-py3_10-inductor-benchmark-build.outputs.test-matrix }}
@ -137,7 +137,7 @@ jobs:
    uses: ./.github/workflows/_xpu-test.yml
    needs: xpu-n-py3_10-inductor-benchmark-build
    with:
-      build-environment: linux-noble-xpu-n-py3.10
+      build-environment: linux-jammy-xpu-n-py3.10
      dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cudagraphs-${{ inputs.cudagraphs }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}-maxautotune-${{ inputs.maxautotune }}-freezing_cudagraphs-${{ inputs.freezing_cudagraphs }}-cudagraphs_low_precision-${{ inputs.cudagraphs }}
      docker-image: ${{ needs.xpu-n-py3_10-inductor-benchmark-build.outputs.docker-image }}
      test-matrix: ${{ needs.xpu-n-py3_10-inductor-benchmark-build.outputs.test-matrix }}
--- a/.github/workflows/inductor-rocm-mi300.yml
+++ b/.github/workflows/inductor-rocm-mi300.yml
@ -7,7 +7,6 @@ on:
      - release/*
    tags:
      - ciflow/inductor-rocm/*
-      - ciflow/inductor-rocm-mi300/*
  workflow_dispatch:

 concurrency:
--- a/.github/workflows/inductor-rocm-mi200.yml
+++ b/.github/workflows/inductor-rocm-mi200.yml
@ -2,12 +2,12 @@ name: inductor-rocm

 on:
  schedule:
-    - cron: 0 */3 * * *
+    - cron: 0 * * * *
  push:
    branches:
      - release/*
    tags:
-      - ciflow/inductor-rocm-mi200/*
+      - ciflow/inductor-rocm/*
  workflow_dispatch:

 concurrency:
--- a/.github/workflows/inductor-unittest.yml
+++ b/.github/workflows/inductor-unittest.yml
@ -81,32 +81,6 @@ jobs:
      test-matrix: ${{ needs.inductor-halide-build.outputs.test-matrix }}
    secrets: inherit

-  inductor-pallas-build:
-    name: inductor-pallas-build
-    uses: ./.github/workflows/_linux-build.yml
-    needs: get-label-type
-    with:
-      build-environment: linux-jammy-cuda12.8-py3.12-gcc11
-      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-py3.12-pallas
-      cuda-arch-list: '8.9'
-      runner: linux.8xlarge.memory
-      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      test-matrix: |
-        { include: [
-          { config: "inductor-pallas", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.12xlarge.nvidia.gpu" },
-        ]}
-    secrets: inherit
-
-  inductor-pallas-test:
-    name: inductor-pallas-test
-    uses: ./.github/workflows/_linux-test.yml
-    needs: inductor-pallas-build
-    with:
-      build-environment: linux-jammy-py3.12-gcc11
-      docker-image: ${{ needs.inductor-pallas-build.outputs.docker-image }}
-      test-matrix: ${{ needs.inductor-pallas-build.outputs.test-matrix }}
-    secrets: inherit
-
  inductor-triton-cpu-build:
    name: inductor-triton-cpu-build
    uses: ./.github/workflows/_linux-build.yml
--- a/.github/workflows/linux-aarch64.yml
+++ b/.github/workflows/linux-aarch64.yml
@ -33,7 +33,7 @@ jobs:
    with:
      runner_prefix: ${{ needs.get-label-type.outputs.label-type }}
      build-environment: linux-jammy-aarch64-py3.10
-      docker-image-name: ci-image:pytorch-linux-jammy-aarch64-py3.10-gcc13
+      docker-image-name: ci-image:pytorch-linux-jammy-aarch64-py3.10-gcc11
      runner: linux.arm64.m7g.4xlarge
      test-matrix: |
        { include: [
--- a/.github/workflows/operator_benchmark.yml
+++ b/.github/workflows/operator_benchmark.yml
@ -60,7 +60,7 @@ jobs:
    with:
      build-environment: linux-jammy-aarch64-py3.10
      runner: linux.arm64.m7g.4xlarge
-      docker-image-name: ci-image:pytorch-linux-jammy-aarch64-py3.10-gcc13
+      docker-image-name: ci-image:pytorch-linux-jammy-aarch64-py3.10-gcc11
      test-matrix: |
        { include: [
          { config: "cpu_operator_benchmark_short", shard: 1, num_shards: 1, runner: "linux.arm64.m8g.4xlarge" },
--- a/.github/workflows/periodic-rocm-mi200.yml
+++ b/.github/workflows/periodic-rocm-mi200.yml
@ -11,6 +11,7 @@ on:
    - cron: 29 8 * * *  # about 1:29am PDT, for mem leak check and rerun disabled tests
  push:
    tags:
+      - ciflow/periodic/*
      - ciflow/periodic-rocm-mi200/*
    branches:
      - release/*
--- a/.github/workflows/periodic-rocm-mi300.yml
+++ b/.github/workflows/periodic-rocm-mi300.yml
@ -11,7 +11,6 @@ on:
    - cron: 29 8 * * *  # about 1:29am PDT, for mem leak check and rerun disabled tests
  push:
    tags:
-      - ciflow/periodic/*
      - ciflow/periodic-rocm-mi300/*
    branches:
      - release/*
--- a/.github/workflows/pull.yml
+++ b/.github/workflows/pull.yml
@ -342,16 +342,16 @@ jobs:
      test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.test-matrix }}
    secrets: inherit

-  linux-noble-xpu-n-py3_10-build:
-    name: linux-noble-xpu-n-py3.10
+  linux-jammy-xpu-n-py3_10-build:
+    name: linux-jammy-xpu-n-py3.10
    uses: ./.github/workflows/_linux-build.yml
    needs: get-label-type
    with:
      # This should sync with the build in xpu.yml but xpu uses a larger runner
      # sync-tag: linux-xpu-n-build
      runner_prefix: ${{ needs.get-label-type.outputs.label-type }}
-      build-environment: linux-noble-xpu-n-py3.10
-      docker-image-name: ci-image:pytorch-linux-noble-xpu-n-py3
+      build-environment: linux-jammy-xpu-n-py3.10
+      docker-image-name: ci-image:pytorch-linux-jammy-xpu-n-py3
      test-matrix: |
        { include: [
          { config: "default", shard: 1, num_shards: 4, runner: "linux.idc.xpu" },
--- a/.github/workflows/rocm-mi300.yml
+++ b/.github/workflows/rocm-mi300.yml
@ -6,7 +6,6 @@ on:
      - main
      - release/*
    tags:
-      - ciflow/rocm/*
      - ciflow/rocm-mi300/*
  workflow_dispatch:
  schedule:
--- a/.github/workflows/rocm-mi200.yml
+++ b/.github/workflows/rocm-mi200.yml
@ -5,12 +5,11 @@ on:
    branches:
      - release/*
    tags:
-      - ciflow/rocm-mi200/*
+      - ciflow/rocm/*
  workflow_dispatch:
  schedule:
    - cron: 29 8 * * *  # about 1:29am PDT
-    - cron: 0 */3 * * *
-
+    - cron: 0 * * * *

 concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
--- a/.github/workflows/slow-rocm-mi200.yml
+++ b/.github/workflows/slow-rocm-mi200.yml
@ -1,81 +0,0 @@
-# This workflow is dedicated to host slow jobs that are run only periodically because
-# they are too slow to run in every commit.  The list of slow tests can be found in
-# https://github.com/pytorch/test-infra/blob/generated-stats/stats/slow-tests.json
-name: slow-rocm-mi200
-
-on:
-  push:
-    branches:
-      - release/*
-    tags:
-      - ciflow/slow/*
-      - ciflow/slow-rocm-mi200/*
-  schedule:
-    - cron: 0 */3 * * *
-  workflow_dispatch:
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}-${{ github.event.schedule }}
-  cancel-in-progress: true
-
-permissions:
-  id-token: write
-  contents: read
-
-jobs:
-  llm-td:
-    if: github.repository_owner == 'pytorch'
-    name: before-test
-    uses: ./.github/workflows/llm_td_retrieval.yml
-    permissions:
-      id-token: write
-      contents: read
-
-  target-determination:
-    name: before-test
-    uses: ./.github/workflows/target_determination.yml
-    needs: llm-td
-    permissions:
-      id-token: write
-      contents: read
-
-  get-label-type:
-    name: get-label-type
-    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
-    if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
-    with:
-      triggering_actor: ${{ github.triggering_actor }}
-      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
-      curr_branch: ${{ github.head_ref || github.ref_name }}
-      curr_ref_type: ${{ github.ref_type }}
-
-  linux-jammy-rocm-py3_10-build:
-    name: linux-jammy-rocm-py3.10
-    uses: ./.github/workflows/_linux-build.yml
-    needs: get-label-type
-    with:
-      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      build-environment: linux-jammy-rocm-py3.10
-      docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
-      sync-tag: rocm-build
-      test-matrix: |
-        { include: [
-          { config: "slow", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.2", owners: ["module:rocm"] },
-          { config: "slow", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.2", owners: ["module:rocm"] },
-        ]}
-    secrets: inherit
-
-  linux-jammy-rocm-py3_10-test:
-    permissions:
-      id-token: write
-      contents: read
-    name: linux-jammy-rocm-py3.10
-    uses: ./.github/workflows/_rocm-test.yml
-    needs:
-      - linux-jammy-rocm-py3_10-build
-      - target-determination
-    with:
-      build-environment: linux-jammy-rocm-py3.10
-      docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }}
-    secrets: inherit
--- a/.github/workflows/slow.yml
+++ b/.github/workflows/slow.yml
@ -105,6 +105,36 @@ jobs:
      test-matrix: ${{ needs.linux-jammy-py3_10-clang12-build.outputs.test-matrix }}
    secrets: inherit

+  linux-jammy-rocm-py3_10-build:
+    name: linux-jammy-rocm-py3.10
+    uses: ./.github/workflows/_linux-build.yml
+    needs: get-label-type
+    with:
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      build-environment: linux-jammy-rocm-py3.10
+      docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
+      test-matrix: |
+        { include: [
+          { config: "slow", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.2", owners: ["module:rocm"] },
+          { config: "slow", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.2", owners: ["module:rocm"] },
+        ]}
+    secrets: inherit
+
+  linux-jammy-rocm-py3_10-test:
+    permissions:
+      id-token: write
+      contents: read
+    name: linux-jammy-rocm-py3.10
+    uses: ./.github/workflows/_rocm-test.yml
+    needs:
+      - linux-jammy-rocm-py3_10-build
+      - target-determination
+    with:
+      build-environment: linux-jammy-rocm-py3.10
+      docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }}
+    secrets: inherit
+
  linux-jammy-py3_10-clang18-asan-build:
    name: linux-jammy-py3.10-clang18-asan
    uses: ./.github/workflows/_linux-build.yml
--- a/.github/workflows/upload-test-stats.yml
+++ b/.github/workflows/upload-test-stats.yml
@ -11,16 +11,15 @@ on:
      - inductor
      - unstable
      - slow
-      - slow-rocm-mi200
      - unstable-periodic
      - inductor-periodic
-      - rocm-mi200
+      - rocm
      - rocm-mi300
      - rocm-mi355
      - inductor-micro-benchmark
      - inductor-micro-benchmark-x86
      - inductor-cu124
-      - inductor-rocm-mi200
+      - inductor-rocm
      - inductor-rocm-mi300
      - mac-mps
      - linux-aarch64
--- a/.github/workflows/xpu.yml
+++ b/.github/workflows/xpu.yml
@ -47,15 +47,15 @@ jobs:
        ]}
    secrets: inherit

-  linux-noble-xpu-n-py3_10-build:
-    name: linux-noble-xpu-n-py3.10
+  linux-jammy-xpu-n-py3_10-build:
+    name: linux-jammy-xpu-n-py3.10
    uses: ./.github/workflows/_linux-build.yml
    needs: get-label-type
    with:
      sync-tag: linux-xpu-n-build
      runner_prefix: ${{ needs.get-label-type.outputs.label-type }}
-      build-environment: linux-noble-xpu-n-py3.10
-      docker-image-name: ci-image:pytorch-linux-noble-xpu-n-py3
+      build-environment: linux-jammy-xpu-n-py3.10
+      docker-image-name: ci-image:pytorch-linux-jammy-xpu-n-py3
      runner: linux.c7i.12xlarge
      test-matrix: |
        { include: [
@ -74,17 +74,17 @@ jobs:
        ]}
    secrets: inherit

-  linux-noble-xpu-n-py3_10-test:
-    name: linux-noble-xpu-n-py3.10
+  linux-jammy-xpu-n-py3_10-test:
+    name: linux-jammy-xpu-n-py3.10
    uses: ./.github/workflows/_xpu-test.yml
-    needs: linux-noble-xpu-n-py3_10-build
+    needs: linux-jammy-xpu-n-py3_10-build
    permissions:
      id-token: write
      contents: read
    with:
-      build-environment: linux-noble-xpu-n-py3.10
-      docker-image: ${{ needs.linux-noble-xpu-n-py3_10-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-noble-xpu-n-py3_10-build.outputs.test-matrix }}
+      build-environment: linux-jammy-xpu-n-py3.10
+      docker-image: ${{ needs.linux-jammy-xpu-n-py3_10-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-jammy-xpu-n-py3_10-build.outputs.test-matrix }}
    secrets: inherit

  windows-xpu-n-1-build:
--- a/.gitignore
+++ b/.gitignore
@ -127,7 +127,6 @@ torch/test/
 torch/utils/benchmark/utils/valgrind_wrapper/callgrind.h
 torch/utils/benchmark/utils/valgrind_wrapper/valgrind.h
 torch/version.py
-torch/_inductor/kernel/vendored_templates/*
 minifier_launcher.py
 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_fwd_d*
 aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_bwd_d*
--- a/.lintrunner.toml
+++ b/.lintrunner.toml
@ -143,8 +143,7 @@ init_command = [
    'tools/linter/adapters/pip_init.py',
    '--dry-run={{DRYRUN}}',
    'numpy==1.26.4 ; python_version >= "3.10" and python_version <= "3.11"',
-    'numpy==2.1.0 ; python_version >= "3.12" and python_version <= "3.13"',
-    'numpy==2.3.4 ; python_version >= "3.14"',
+    'numpy==2.1.0 ; python_version >= "3.12"',
    'expecttest==0.3.0',
    'pyrefly==0.36.2',
    'sympy==1.13.3',
@ -1402,7 +1401,7 @@ init_command = [
    '--dry-run={{DRYRUN}}',
    'usort==1.0.8.post1',
    'isort==6.0.1',
-    'ruff==0.14.4',  # sync with RUFF
+    'ruff==0.13.1',  # sync with RUFF
 ]
 is_formatter = true

@ -1537,7 +1536,7 @@ init_command = [
    'python3',
    'tools/linter/adapters/pip_init.py',
    '--dry-run={{DRYRUN}}',
-    'ruff==0.14.4',  # sync with PYFMT
+    'ruff==0.13.1',  # sync with PYFMT
 ]
 is_formatter = true

--- a/6
+++ b/6
@ -210,12 +210,8 @@ torch/backends/cudnn/ @eqy @syed-ahmed @Aidyn-A
 /test/inductor/test_flex_attention.py @drisspg
 /test/inductor/test_flex_decoding.py @drisspg

-# Low Precision & Grouped GEMMs
+# Low Precision GEMMs
 /aten/src/ATen/native/cuda/Blas.cpp @drisspg @slayton58
-/aten/src/ATen/native/cuda/GroupedBlas.cpp @drisspg @slayton58
-/aten/src/ATen/native/cuda/ScaledBlas.cpp @drisspg @slayton58
 /aten/src/ATen/cuda/CUDABlas.cpp @drisspg @slayton58
 /aten/src/ATen/cuda/CUDABlas.h @drisspg @slayton58
-/aten/src/ATen/cuda/CUDAScaledBlas.cpp @drisspg @slayton58
-/aten/src/ATen/cuda/CUDAScaledBlas.h @drisspg @slayton58
 /test/test_scaled_matmul_cuda.py @drisspg @slayton58
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@ -18,7 +18,7 @@ aspects of contributing to PyTorch.
  - [Python Unit Testing](#python-unit-testing)
  - [Better local unit tests with `pytest`](#better-local-unit-tests-with-pytest)
  - [Local linting](#local-linting)
-    - [Running `pyrefly`](#running-pyrefly)
+    - [Running `mypy`](#running-mypy)
  - [C++ Unit Testing](#c-unit-testing)
  - [Run Specific CI Jobs](#run-specific-ci-jobs)
 - [Merging your Change](#merging-your-change)
@ -281,7 +281,7 @@ dependencies as well as the nightly binaries into the repo directory.
 **Prerequisites**:
 The following packages should be installed with `pip`:
 - `expecttest` and `hypothesis` - required to run tests
- `pyrefly` - recommended for type checking. [Pyrefly](https://pyrefly.org/)
+- `mypy` - recommended for linting
 - `pytest` - recommended to run tests more selectively
 Running
 ```
@ -350,32 +350,15 @@ make lint

 Learn more about the linter on the [lintrunner wiki page](https://github.com/pytorch/pytorch/wiki/lintrunner)

-#### Running `pyrefly`
+#### Running `mypy`

-[Pyrefly](https://pyrefly.org/) is a high-performance static type checker for Python. It provides fast type checking along with IDE features like autocomplete and instant error feedback.
-
-PyTorch uses Pyrefly for type checking across the codebase. The configuration is managed in `pyrefly.toml` at the root of the repository.
-
-**Getting Started with Pyrefly:**
-
-To run type checking on the PyTorch codebase:
-```bash
-pyrefly check
-```
-
-For more detailed error information with summaries:
-```bash
-pyrefly check --summarize-errors
-```
-
-**Learn More:**
- [Pyrefly Configuration](https://pyrefly.org/en/docs/configuration/) - Detailed configuration options
- [Pyrefly IDE Features](https://pyrefly.org/en/docs/IDE-features/) - Set up Pyrefly in your editor for real-time type checking
- [Python Typing Tutorial](https://pyrefly.org/en/docs/typing-for-python-developers/) - Learn about Python type annotations
+`mypy` is an optional static type checker for Python. We have multiple `mypy`
+configs for the PyTorch codebase that are automatically validated against whenever the linter is run.

 See [Guide for adding type annotations to
 PyTorch](https://github.com/pytorch/pytorch/wiki/Guide-for-adding-type-annotations-to-PyTorch)
-for PyTorch-specific guidance on how to set up `pyrefly` and tackle type annotation tasks in this codebase.
+for more information on how to set up `mypy` and tackle type annotation
+tasks.

 ### C++ Unit Testing

--- a/aten/src/ATen/Context.h
+++ b/aten/src/ATen/Context.h
@ -174,12 +174,6 @@ class TORCH_API Context {
  static long versionCuDNN() {
    return detail::getCUDAHooks().versionCuDNN();
  }
-  static long versionRuntimeCuDNN() {
-    return detail::getCUDAHooks().versionRuntimeCuDNN();
-  }
-  static long versionCuDNNFrontend() {
-    return detail::getCUDAHooks().versionCuDNNFrontend();
-  }
  static bool hasCuSOLVER() {
    return detail::getCUDAHooks().hasCuSOLVER();
  }
--- a/aten/src/ATen/DeviceAccelerator.h
+++ b/aten/src/ATen/DeviceAccelerator.h
@ -94,11 +94,6 @@ TORCH_API inline void resetPeakStats(c10::DeviceIndex device_index) {
  at::getDeviceAllocator(device_type)->resetPeakStats(device_index);
 }

-TORCH_API inline std::pair<size_t, size_t> getMemoryInfo(
-    c10::DeviceIndex device_index) {
-  const auto device_type = getAccelerator(true).value();
-  return at::getDeviceAllocator(device_type)->getMemoryInfo(device_index);
-}
 } // namespace at::accelerator

 namespace at {
--- a/aten/src/ATen/Dispatch.h
+++ b/aten/src/ATen/Dispatch.h
@ -6,7 +6,6 @@
 #include <c10/util/Half.h>
 #include <c10/util/Metaprogramming.h>
 #include <c10/util/complex.h>
-#include <torch/headeronly/core/Dispatch.h>

 #ifdef __CUDACC__
 #include <cuda.h> // For CUDA_VERSION
@ -62,9 +61,12 @@ TORCH_API void record_kernel_function_dtype(std::string name);
    }                                                 \
  } while (0)

-#define AT_PRIVATE_CASE_TYPE_USING_HINT(enum_type, HINT, ...) \
-  THO_PRIVATE_CASE_TYPE_USING_HINT_TMPL(                      \
-      AT_PRIVATE_CHECK_SELECTIVE_BUILD, enum_type, HINT, __VA_ARGS__)
+#define AT_PRIVATE_CASE_TYPE_USING_HINT(enum_type, HINT, ...)                 \
+  case enum_type: {                                                           \
+    AT_PRIVATE_CHECK_SELECTIVE_BUILD(enum_type);                              \
+    using HINT [[maybe_unused]] = c10::impl::ScalarTypeToCPPTypeT<enum_type>; \
+    return __VA_ARGS__();                                                     \
+  }

 #define AT_DISPATCH_CASE(enum_type, ...) \
  AT_PRIVATE_CASE_TYPE_USING_HINT(enum_type, scalar_t, __VA_ARGS__)
@ -93,6 +95,14 @@ TORCH_API void record_kernel_function_dtype(std::string name);
    return __VA_ARGS__();                                                   \
  }

+namespace detail {
+
+inline at::ScalarType scalar_type(at::ScalarType s) {
+  return s;
+}
+
+} // namespace detail
+
 // The AT_DISPATCH_* family of macros provides the ability to
 // conveniently generate specializations of a kernel over all of the
 // dtypes we care about in PyTorch.  We call it "dispatch" because
@ -180,13 +190,27 @@ TORCH_API void record_kernel_function_dtype(std::string name);
 // but we're just being safe (and it doesn't hurt.)  Note we must
 // use it to shut up warnings about unused store.

-#define AT_DISPATCH_SWITCH(TYPE, NAME, ...) \
-  THO_DISPATCH_SWITCH_TMPL(                 \
-      RECORD_KERNEL_FUNCTION_DTYPE,         \
-      TORCH_CHECK_NOT_IMPLEMENTED,          \
-      TYPE,                                 \
-      NAME,                                 \
-      __VA_ARGS__)
+#define AT_DISPATCH_SWITCH(TYPE, NAME, ...)                                 \
+  [&] {                                                                     \
+    const auto& the_type = TYPE;                                            \
+    constexpr const char* at_dispatch_name = NAME;                          \
+    /* don't use TYPE again in case it is an expensive or side-effect op */ \
+    at::ScalarType _st = ::detail::scalar_type(the_type);                   \
+    RECORD_KERNEL_FUNCTION_DTYPE(at_dispatch_name, _st);                    \
+    C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wswitch-enum")             \
+    switch (_st) {                                                          \
+      __VA_ARGS__                                                           \
+      default:                                                              \
+        TORCH_CHECK_NOT_IMPLEMENTED(                                        \
+            false,                                                          \
+            '"',                                                            \
+            at_dispatch_name,                                               \
+            "\" not implemented for '",                                     \
+            toString(_st),                                                  \
+            "'");                                                           \
+    }                                                                       \
+    C10_DIAGNOSTIC_POP()                                                    \
+  }()

 #define AT_DISPATCH_CASE_FLOATING_TYPES(...)            \
  AT_DISPATCH_CASE(at::ScalarType::Double, __VA_ARGS__) \
--- a/aten/src/ATen/Dispatch_v2.h
+++ b/aten/src/ATen/Dispatch_v2.h
@ -1,8 +1,3 @@
-#pragma once
-
-#include <torch/headeronly/core/Dispatch_v2.h>
-
-// Get AT_DISPATCH_SWITCH and AT_DISPATCH_CASE:
 #include <ATen/Dispatch.h>

 // This is a new implementation of the AT_DISPATCH macro family from
@ -79,19 +74,41 @@
 // macro expansion occurs, mediated with AT_EXPAND and AT_GUARD.  I mostly
 // relied on GPT4 to help me get it right.

+// Public API macros
+
 // See documentation above
 #define AT_DISPATCH_V2(TYPE, NAME, BODY, ...) \
-  THO_DISPATCH_V2_TMPL(                       \
-      AT_DISPATCH_SWITCH,                     \
-      AT_DISPATCH_CASE,                       \
-      TYPE,                                   \
-      NAME,                                   \
-      AT_WRAP(BODY),                          \
-      __VA_ARGS__)
+  AT_DISPATCH_SWITCH(TYPE, NAME, AT_AP_VAR(AT_WRAP(BODY), TYPE, __VA_ARGS__))
+
+// This macro lets you pass an arbitrary expression that may contain internal
+// commas to another macro without having the commas causing the expression
+// to be interpreted as being multiple arguments
+#define AT_WRAP(...) __VA_ARGS__
+
+#define AT_FLOAT8_TYPES                                          \
+  c10::kFloat8_e5m2, c10::kFloat8_e5m2fnuz, c10::kFloat8_e4m3fn, \
+      c10::kFloat8_e4m3fnuz, c10::kFloat8_e8m0fnu
+
+#define AT_INTEGRAL_TYPES \
+  c10::kByte, c10::kChar, c10::kInt, c10::kLong, c10::kShort
+#define AT_FLOATING_TYPES c10::kDouble, c10::kFloat
+#define AT_BAREBONES_UNSIGNED_TYPES c10::kUInt16, c10::kUInt32, c10::kUInt64
+#define AT_INTEGRAL_TYPES_V2 \
+  AT_EXPAND(AT_INTEGRAL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES)
+#define AT_COMPLEX_TYPES c10::kComplexDouble, c10::kComplexFloat
+#define AT_QINT_TYPES c10::kQInt8, c10::kQUInt8, c10::kQInt32
+// NB: not *actually* all types
+#define AT_ALL_TYPES AT_EXPAND(AT_INTEGRAL_TYPES), AT_EXPAND(AT_FLOATING_TYPES)
+#define AT_ALL_TYPES_AND_COMPLEX \
+  AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_COMPLEX_TYPES)
+
+// Helper macros

-// Unused helper macros, kept for BC:
 #define AT_AP_VAR(N, T, ...) \
  AT_EXPAND(AT_CONCAT(AT_AP, AT_NUM_ARGS(__VA_ARGS__))(AT_WRAP(N), __VA_ARGS__))
+#define AT_CONCAT(a, b) AT_CONCAT_AUX(a, b)
+#define AT_CONCAT_AUX(a, b) a##b
+#define AT_EXPAND(X) X

 // Ensure we never have too many scalar types for the expansion here to
 // support.  To bump this, you must regenerate the macros below.
@ -102,6 +119,12 @@ static_assert(static_cast<int>(c10::ScalarType::NumOptions) < 60);

 num_args = 60

+nums = ', '.join(str(i) for i in reversed(range(num_args+1)))
+args = ', '.join(f'_{i}' for i in range(1, num_args+1))
+
+print(f'#define AT_NUM_ARGS(...) AT_EXPAND(AT_NUM_ARGS_AUX(__VA_ARGS__, {nums}))')
+print(f'#define AT_NUM_ARGS_AUX({args}, N, ...) N')
+
 for i in range(1, num_args+1):
    args = ', '.join(f'_{i}' for i in range(1, i+1))
    cases = ' '.join([f'AT_DISPATCH_CASE(_{j}, N)' for j in range(1, i+1)])
@ -112,6 +135,8 @@ for i in range(1, num_args+1):
 // Begin generated code
 // clang-format off

+#define AT_NUM_ARGS(...) AT_EXPAND(AT_NUM_ARGS_AUX(__VA_ARGS__, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0))
+#define AT_NUM_ARGS_AUX(_1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34, _35, _36, _37, _38, _39, _40, _41, _42, _43, _44, _45, _46, _47, _48, _49, _50, _51, _52, _53, _54, _55, _56, _57, _58, _59, _60, N, ...) N
 #define AT_AP1(N, _1) AT_DISPATCH_CASE(_1, N)
 #define AT_AP2(N, _1, _2) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N)
 #define AT_AP3(N, _1, _2, _3) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N)
--- a/aten/src/ATen/core/CachingHostAllocator.h
+++ b/aten/src/ATen/core/CachingHostAllocator.h
@ -226,8 +226,8 @@ template <
    typename B = HostBlock<S>>
 struct CachingHostAllocatorImpl {
  virtual ~CachingHostAllocatorImpl() {
-    if (active_) {
-      active_ = false;
+    active_ = false;
+    if (pinned_use_background_threads()) {
      getBackgroundThreadPool()->waitWorkComplete();
    }
  }
@ -260,7 +260,6 @@ struct CachingHostAllocatorImpl {
    if (pinned_use_background_threads()) {
      // Launch the background thread and process events in a loop.
      static bool background_thread_flag [[maybe_unused]] = [this] {
-        active_ = true;
        getBackgroundThreadPool()->run([&]() {
          while (active_) {
            process_events();
@ -684,9 +683,9 @@ struct CachingHostAllocatorImpl {
  alignas(hardware_destructive_interference_size) std::mutex events_mutex_;
  std::deque<std::pair<E, B*>> events_; // event queue paired with block

-  // Indicates whether the event-processing thread pool is active.
+  // Indicates whether the object is active.
  // Set to false in the destructor to signal background threads to stop.
-  std::atomic<bool> active_{false};
+  std::atomic<bool> active_{true};
 protected:
  alignas(hardware_destructive_interference_size) HostStatsStaged stats_;
 };
--- a/aten/src/ATen/cuda/CUDABlas.cpp
+++ b/aten/src/ATen/cuda/CUDABlas.cpp
@ -388,7 +388,6 @@ static inline bool bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(D
 #ifndef USE_ROCM
  at::Half halpha;
  at::Half hbeta;
-  uint32_t mask = -1;
 #endif
  void * alpha_ptr = &alpha;
  void * beta_ptr = &beta;
@ -428,7 +427,7 @@ static inline bool bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(D
    auto fp16_reduction = at::globalContext().allowFP16ReductionCuBLAS();
    if (fp16_reduction !=
        at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) {
-      mask =
+      uint32_t mask =
          fp16_reduction ==
                  at::CuBLASReductionOption::DisallowReducedPrecisionAllowSplitK
              ? (CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE |
@ -445,7 +444,7 @@ static inline bool bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(D
    auto bf16_reduction = at::globalContext().allowBF16ReductionCuBLAS();
    if (bf16_reduction !=
        at::CuBLASReductionOption::AllowReducedPrecisionWithSplitK) {
-      mask =
+      uint32_t mask =
          bf16_reduction ==
                  at::CuBLASReductionOption::DisallowReducedPrecisionAllowSplitK
              ? (CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE |
@ -512,41 +511,17 @@ static inline bool bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(D
  cublasStatus_t cublasStatus = CUBLAS_STATUS_SUCCESS;
  cublasLtMatmulHeuristicResult_t heuristicResult = {};
  int returnedResult = 0;
-  // on Blackwell+, we fake a n > 1 matmul when querying heuristics
-  // to prevent cuBLASLt from dispatching to a GEMV kernel for batch-invariance
-#ifndef USE_ROCM
-  const bool lie_to_cublaslt = mask == CUBLASLT_REDUCTION_SCHEME_NONE && n == 1 && at::cuda::getCurrentDeviceProperties()->major >= 10;
-#else
-  const bool lie_to_cublaslt = false;
-#endif
-  if (lie_to_cublaslt) {
-     CuBlasLtMatrixLayout FakeBdesc(abType, k, 2, ldb, opb == CUBLAS_OP_T);
-     CuBlasLtMatrixLayout FakeCdesc(cType, m, 2, ldc);
-
-     TORCH_CUDABLAS_CHECK(cublasLtMatmulAlgoGetHeuristic(
-        ltHandle,
-        computeDesc.descriptor(),
-        Adesc.descriptor(),
-        FakeBdesc.descriptor(),
-        FakeCdesc.descriptor(),
-        FakeCdesc.descriptor(),
-        preference.descriptor(),
-        1,
-        &heuristicResult,
-        &returnedResult));
-  } else {
-    TORCH_CUDABLAS_CHECK(cublasLtMatmulAlgoGetHeuristic(
-        ltHandle,
-        computeDesc.descriptor(),
-        Adesc.descriptor(),
-        Bdesc.descriptor(),
-        Cdesc.descriptor(),
-        Cdesc.descriptor(),
-        preference.descriptor(),
-        1,
-        &heuristicResult,
-        &returnedResult));
-  }
+  TORCH_CUDABLAS_CHECK(cublasLtMatmulAlgoGetHeuristic(
+      ltHandle,
+      computeDesc.descriptor(),
+      Adesc.descriptor(),
+      Bdesc.descriptor(),
+      Cdesc.descriptor(),
+      Cdesc.descriptor(),
+      preference.descriptor(),
+      1,
+      &heuristicResult,
+      &returnedResult));
  if (returnedResult == 0) {
    cublasStatus = CUBLAS_STATUS_NOT_SUPPORTED;
  }
@ -1597,7 +1572,7 @@ bool gemm_and_bias(
  }

  using opmath_t = at::opmath_type<Dtype>;
-  opmath_t beta_val = bias ? 0 : 1; // bias is added in epilogue unless nullptr
+  opmath_t beta_val = 0; // bias is added in epilogue

  cudaDataType_t abType = CUDA_R_32F;
  cudaDataType_t cType = CUDA_R_32F;
@ -1686,22 +1661,15 @@ bool gemm_and_bias(
    _syncCurrentWithCarveoutStream(stream, true);
  }
 #endif
-  const auto epilogue = [&]() -> cublasLtEpilogue_t {
-    // The cuBLAS documentation indicates that
-    // *_<ACTIVATION>_BIAS = *_<ACTIVATION>,
-    // but we keep it verbose here for clarity.
-    switch (activation) {
-      case GEMMAndBiasActivationEpilogue::RELU:
-        return bias ? CUBLASLT_EPILOGUE_RELU_BIAS : CUBLASLT_EPILOGUE_RELU;
-      case GEMMAndBiasActivationEpilogue::GELU:
-        return bias ? CUBLASLT_EPILOGUE_GELU_BIAS : CUBLASLT_EPILOGUE_GELU;
-      default:
-        return bias ? CUBLASLT_EPILOGUE_BIAS : CUBLASLT_EPILOGUE_DEFAULT;
-    }
-  }();
-  computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_EPILOGUE, epilogue);
+  cublasLtEpilogue_t epilogue = CUBLASLT_EPILOGUE_BIAS;
+  if (activation == GEMMAndBiasActivationEpilogue::RELU) {
+    epilogue = CUBLASLT_EPILOGUE_RELU_BIAS;
+  } else if (activation == GEMMAndBiasActivationEpilogue::GELU) {
+    epilogue = CUBLASLT_EPILOGUE_GELU_BIAS;
+  }

-  if (bias) {
+  if (bias != nullptr) {
+    computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_EPILOGUE, epilogue);
    computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_BIAS_POINTER, bias);
  }

--- a/aten/src/ATen/cuda/cub.h
+++ b/aten/src/ATen/cuda/cub.h
@ -24,13 +24,7 @@ namespace detail {
 // radix_sort_pairs doesn't interact with value_t other than to copy
 // the data, so we can save template instantiations by reinterpreting
 // it as an opaque type.
-// We use native integer types for 1/2/4/8-byte values to reduce
-// register usage in CUDA kernels. For sizes > 8 fall back to char array.
 template <int N> struct alignas(N) OpaqueType { char data[N]; };
-template <> struct alignas(1) OpaqueType<1> { uint8_t data; };
-template <> struct alignas(2) OpaqueType<2> { uint16_t data; };
-template <> struct alignas(4) OpaqueType<4> { uint32_t data; };
-template <> struct alignas(8) OpaqueType<8> { uint64_t data; };

 template<typename key_t, int value_size>
 void radix_sort_pairs_impl(
--- a/aten/src/ATen/cuda/detail/CUDAHooks.cpp
+++ b/aten/src/ATen/cuda/detail/CUDAHooks.cpp
@ -21,7 +21,6 @@

 #if AT_CUDNN_ENABLED()
 #include <ATen/cudnn/cudnn-wrapper.h>
-#include <cudnn_frontend.h>
 #endif

 #if AT_MAGMA_ENABLED()
@ -352,26 +351,6 @@ long CUDAHooks::versionCuDNN() const {
 #endif
 }

-long CUDAHooks::versionRuntimeCuDNN() const {
-#if AT_CUDNN_ENABLED()
-#ifndef USE_STATIC_CUDNN
-  return cudnnGetVersion();
-#else
-  return CUDNN_VERSION;
-#endif
-#else
-  TORCH_CHECK(false, "Cannot query CuDNN version if ATen_cuda is not built with CuDNN");
-#endif
-}
-
-long CUDAHooks::versionCuDNNFrontend() const {
-#if AT_CUDNN_ENABLED()
-  return CUDNN_FRONTEND_VERSION;
-#else
-  TORCH_CHECK(false, "Cannot query CuDNN Frontend version if ATen_cuda is not built with CuDNN");
-#endif
-}
-
 long CUDAHooks::versionMIOpen() const {
 #if AT_ROCM_ENABLED()
  return MIOPEN_VERSION_MAJOR * 10000 +
--- a/aten/src/ATen/cuda/detail/CUDAHooks.h
+++ b/aten/src/ATen/cuda/detail/CUDAHooks.h
@ -49,8 +49,6 @@ struct CUDAHooks : public at::CUDAHooksInterface {
  bool hasCUDART() const override;
  long versionCUDART() const override;
  long versionCuDNN() const override;
-  long versionRuntimeCuDNN() const override;
-  long versionCuDNNFrontend() const override;
  long versionMIOpen() const override;
  std::string showConfig() const override;
  double batchnormMinEpsilonCuDNN() const override;
--- a/aten/src/ATen/detail/CUDAHooksInterface.h
+++ b/aten/src/ATen/detail/CUDAHooksInterface.h
@ -174,14 +174,6 @@ struct TORCH_API CUDAHooksInterface : AcceleratorHooksInterface {
    TORCH_CHECK(false, "Cannot query cuDNN version without ATen_cuda library. ", CUDA_HELP);
  }

-  virtual long versionRuntimeCuDNN() const {
-    TORCH_CHECK(false, "Cannot query cuDNN version without ATen_cuda library. ", CUDA_HELP);
-  }
-
-  virtual long versionCuDNNFrontend() const {
-    TORCH_CHECK(false, "Cannot query cuDNN Frontend version without ATen_cuda library. ", CUDA_HELP);
-  }
-
  virtual long versionMIOpen() const {
    TORCH_CHECK(false, "Cannot query MIOpen version without ATen_cuda library. ", CUDA_HELP);
  }
--- a/aten/src/ATen/functorch/BatchedTensorImpl.h
+++ b/aten/src/ATen/functorch/BatchedTensorImpl.h
@ -157,8 +157,6 @@ constexpr DispatchKeySet kKeysToPropagateToWrapper({
  DispatchKey::Negative,
  DispatchKey::Conjugate,
  DispatchKey::XLA,
-  DispatchKey::XPU,
-  DispatchKey::HPU,
  DispatchKey::CUDA,
  DispatchKey::CPU,
  DispatchKey::PrivateUse1,
--- a/aten/src/ATen/native/BinaryOps.cpp
+++ b/aten/src/ATen/native/BinaryOps.cpp
@ -1009,25 +1009,12 @@ static Device correct_out_device(const Tensor& self, const Tensor& other) {
  }
 }

-static Tensor send_to_meta(const Tensor& self, const Device& device) {
-  Tensor out_meta;
-  if (self._is_zerotensor() && self.unsafeGetTensorImpl()->is_wrapped_number()) {
-    out_meta = at::_efficientzerotensor(self.sizes(), self.options().device(device));
-    out_meta.unsafeGetTensorImpl()->set_wrapped_number(true);
-  } else {
-    out_meta = self.to(device);
-  }
-  return out_meta;
-}
-
 Tensor mul_zerotensor(const Tensor& self, const Tensor& other) {
  auto out_device = correct_out_device(self, other);
  // hack to use the TensorIterator to get the correct broadcasting and type promotion logic
  auto device_ = Device(DeviceType::Meta);
  constexpr c10::DispatchKeySet meta_dks(at::DispatchKey::Meta);
-  auto self_meta = send_to_meta(self, device_);
-  auto other_meta = send_to_meta(other, device_);
-  auto meta_out = at::_ops::mul_Tensor::redispatch(meta_dks, self_meta, other_meta);
+  auto meta_out = at::_ops::mul_Tensor::redispatch(meta_dks, self.to(device_), other.to(device_));
  return at::_efficientzerotensor(meta_out.sizes(), meta_out.options().device(out_device));
 }

@ -1036,9 +1023,7 @@ Tensor div_zerotensor(const Tensor& self, const Tensor& other) {
  // hack to use the TensorIterator to get the correct broadcasting and type promotion logic
  auto device_ = Device(DeviceType::Meta);
  constexpr c10::DispatchKeySet meta_dks(at::DispatchKey::Meta);
-  auto self_meta = send_to_meta(self, device_);
-  auto other_meta = send_to_meta(other, device_);
-  auto meta_out = at::_ops::div_Tensor::redispatch(meta_dks, self_meta, other_meta);
+  auto meta_out = at::_ops::div_Tensor::redispatch(meta_dks, self.to(device_), other.to(device_));

  if (self._is_zerotensor()) {
    if (other._is_zerotensor()) {
@ -1067,9 +1052,8 @@ static Tensor maybe_add_maybe_sub(const Tensor& self, const Tensor& other, const
  // hack to use the TensorIterator to get the correct broadcasting and type promotion logic
  auto device_ = Device(DeviceType::Meta);
  constexpr c10::DispatchKeySet meta_dks(at::DispatchKey::Meta);
-  auto self_meta = send_to_meta(self, device_);
-  auto other_meta = send_to_meta(other, device_);
-  auto meta_out = at::_ops::add_Tensor::redispatch(meta_dks, self_meta, other_meta, alpha);
+  auto meta_out = at::_ops::add_Tensor::redispatch(
+      meta_dks, self.to(device_), other.to(device_), alpha);

  auto get_out_like = [&] (const Tensor& tensor)
  {
--- a/aten/src/ATen/native/Convolution.cpp
+++ b/aten/src/ATen/native/Convolution.cpp
@ -409,7 +409,7 @@ struct ConvParams {
    if (!detail::getCUDAHooks().compiledWithCuDNN() || !input.is_cuda() || !cudnn_enabled) {
      return false;
    }
-    static long cudnn_version = detail::getCUDAHooks().versionRuntimeCuDNN();
+    static long cudnn_version = detail::getCUDAHooks().versionCuDNN();
    // broken on cuDNN 9.8 - 9.14
    if (cudnn_version >= 90800 && cudnn_version < 91500) {
      if (cudnn_conv_suggest_memory_format(input, weight) == at::MemoryFormat::Contiguous &&
@ -453,7 +453,7 @@ struct ConvParams {
    }
    // native kernel doesn't support 64-bit non-splittable case
    if (!(canUse32BitIndexMath(input) && canUse32BitIndexMath(weight))) {
-      static long cudnn_version = detail::getCUDAHooks().compiledWithCuDNN() ? detail::getCUDAHooks().versionRuntimeCuDNN() : -1;
+      static long cudnn_version = detail::getCUDAHooks().compiledWithCuDNN() ? detail::getCUDAHooks().versionCuDNN() : -1;
      // TODO(eqy): remove this once cuDNN fixes 64-bit depthwise support, first broken in 9.11x
      if (cudnn_conv_suggest_memory_format(input, weight) != at::MemoryFormat::Contiguous) {
        if (cudnn_version < 0 || cudnn_version > 91000) {
--- a/aten/src/ATen/native/Linear.cpp
+++ b/aten/src/ATen/native/Linear.cpp
@ -50,35 +50,18 @@ static inline bool parseLinearFlatten3d() {
 // `_flatten_nd_linear` flattens all but the last dimension of the input tensor
 // before passing it to linear operation
 static inline Tensor _flatten_nd_linear(const Tensor& input, const Tensor& weight, const Tensor& bias) {
-  const auto input_sizes = input.sym_sizes();
-
-  const auto result_flattened = [&]() -> Tensor {
-    const auto input_ncols = input_sizes.back();
-    const auto input_flattened_nrows = [&]() -> c10::SymInt {
-      // can't use -1 in reshape because it errors when a dimension is 0
-      auto flattened_nrows = c10::SymInt{1};
-      for (const auto& size : input_sizes.slice(0, input_sizes.size() - 1)) {
-        flattened_nrows *= size;
-      }
-      return flattened_nrows;
-    }();
-
-    const auto input_flattened = input.view_symint({input_flattened_nrows, input_ncols});
-    if (weight.layout() == c10::kStrided) {
-      return at::addmm(bias, input_flattened, weight.t());
-    } else {
-      // weight is sparse, and addmm for sparse expects matmul lhs to be sparse,
-      // so we transpose the problem.
-      // NOTE: at::matmul handles (dense @ sparse) similarly.
-      const auto bias_t = (bias.dim() >= 2) ? bias.mT() : bias.unsqueeze(-1);
-      return at::addmm(bias_t, weight, input_flattened.t()).t();
+    const auto input_sizes = input.sym_sizes();
+    // can't use -1 in reshape because it errors when a dimension is 0
+    c10::SymInt flattened_dim = 1;
+    for (int64_t i = 0, ndim = input_sizes.size(); i < ndim - 1; ++i) {
+      flattened_dim = flattened_dim * input_sizes[i];
    }
-  }();
-
-  // Unflatten flattened row dims
-  auto result_sizes = c10::SymDimVector{input_sizes.begin(), input_sizes.end()};
-  result_sizes.back() = result_flattened.sym_size(1);
-  return result_flattened.view_symint(result_sizes);
+    auto inp_reshape = input.reshape_symint({flattened_dim, input_sizes.at(input_sizes.size() -1)});
+    const auto result = at::addmm(bias, inp_reshape, weight.t());
+    auto new_size = input_sizes.slice(0, input_sizes.size() - 1);
+    c10::SymDimVector sizes_vec(new_size.begin(), new_size.end());
+    sizes_vec.push_back(result.sym_size(1));
+    return result.view_symint(sizes_vec);
 }


@ -107,23 +90,15 @@ Tensor linear(const Tensor& input, const Tensor& weight, const std::optional<Ten
    // Fused op is marginally faster.
    return at::addmm(*bias, input, weight.t());
  }
-
-  const auto is_bias_likely_fusable = (
-      bias->defined() &&
-      // cuBLASLt: will fuse in the epilogue without copies
-      // when input/weight/bias are all strided.
-      // When weight is not strided, bias will not be fused,
-      // but we can still dispatch here to avoid at::matmul
-      // path which will probably use a very similar
-      // flattening optimization.
-      ((bias->dim() == 1 || bias->squeeze().dim() == 1) && bias->is_contiguous_or_false())
-  );
-  if (is_bias_likely_fusable && !input.is_xla()) {
-    // Also hit the fused path for contiguous nD input, if not using xla
+  if (bias->defined() && !input.is_xla()) {
+    // Also hit the fused path for contiguous 3D input, if not using xla
    // backend. Reshaping/flattening has some performance implications on xla.
-    if (input.is_contiguous_or_false()) {
+    bool is_contiguous = input.is_contiguous_or_false();
+    if (is_contiguous && input_dim == 3) {
      return _flatten_nd_linear(input, weight, *bias);
-    } else if (parseLinearFlatten3d()) {
+    } else if (is_contiguous && input.layout() == c10::kStrided && weight.layout() == c10::kStrided && bias->dim() == 1) {
+      return _flatten_nd_linear(input, weight, *bias);
+    } else if (parseLinearFlatten3d() && input_dim == 3) {
      // If user forces flattening via env var
      const Tensor input_cont = input.contiguous();
      return _flatten_nd_linear(input_cont, weight, *bias);
--- a/aten/src/ATen/native/TensorShape.cpp
+++ b/aten/src/ATen/native/TensorShape.cpp
@ -1,6 +1,5 @@
 #include <ATen/core/ATen_fwd.h>
 #include <c10/core/ScalarType.h>
-#include <c10/core/SymInt.h>
 #define TORCH_ASSERT_ONLY_METHOD_OPERATORS
 #include <ATen/AccumulateType.h>
 #include <ATen/Dispatch.h>
@ -1711,37 +1710,11 @@ Tensor narrow_symint(
      "], but got ",
      start,
      ")")
-
-  auto cond1 = TORCH_GUARD_OR_FALSE(start.sym_lt(0));
-  auto cond2 = TORCH_GUARD_OR_FALSE(start.sym_ge(0));
-
-  if (cond1 || cond2) {
-    if (cond1) {
-      start = start + cur_size;
-    }
-
-    TORCH_SYM_CHECK(
-        start.sym_le(cur_size - length),
-        "start (",
-        start,
-        ") + length (",
-        length,
-        ") exceeds dimension size (",
-        cur_size,
-        ").");
-    return at::slice_symint(self, dim, start, start + length, 1);
+  if (start < 0) {
+    start = start + cur_size;
  }
-
-  // Unbacked start handling!
-
-  // Bounds check without converting start:
-  // - If start < 0: need (start + cur_size) + length <= cur_size, i.e., start +
-  // length <= 0
-  // - If start >= 0: need start + length <= cur_size
-  auto end = start + length;
  TORCH_SYM_CHECK(
-      (start.sym_lt(0).sym_and((end).sym_le(0)))
-          .sym_or(start.sym_ge(0).sym_and((end).sym_le(cur_size))),
+      start.sym_le(cur_size - length),
      "start (",
      start,
      ") + length (",
@ -1749,28 +1722,7 @@ Tensor narrow_symint(
      ") exceeds dimension size (",
      cur_size,
      ").");
-
-  if (TORCH_GUARD_OR_FALSE(end.sym_ne(0))) {
-    return at::slice_symint(self, dim, start, end, 1);
-  } else {
-    // Cannot statically determine the condition due to unbacked.
-    // This is an interesting situation; when start is negative and
-    // start + length == 0, slice and narrow do different things.
-    // i.e., x.narrow(0, -2, 2) != x[-2:0]; in that case, we want to
-    // pass curr_size instead of 0. Otherwise, they would do the same thing.
-    // This says at runtime: if start < 0 and end == 0, then pass curr_size
-    // instead of 0.
-
-    auto use_different = start.sym_lt(0).sym_and(end.sym_eq(0)).toSymInt();
-    auto result =
-        at::slice_symint(self, dim, start, end + use_different * cur_size, 1);
-
-    // Ensure slice allocated unbacked size is specialized to length.
-    SymInt new_size = result.sym_size(dim);
-    TORCH_SYM_CHECK(new_size.sym_eq(length), "")
-
-    return result;
-  }
+  return at::slice_symint(self, dim, start, start + length, 1);
 }

 // This overload exists purely for XLA, because they wanted to pass in
@ -1784,8 +1736,8 @@ Tensor narrow_tensor_symint(
      start.dim() == 0 &&
          isIntegralType(start.scalar_type(), /*includeBool=*/false),
      "start must be an 0-dim integral Tensor.");
-  c10::SymInt st = start.item().toSymInt();
-  return at::narrow_symint(self, dim, std::move(st), std::move(length));
+  int64_t st = start.item<int64_t>();
+  return at::narrow_symint(self, dim, c10::SymInt(st), std::move(length));
 }

 std::
--- a/aten/src/ATen/native/cuda/Blas.cpp
+++ b/aten/src/ATen/native/cuda/Blas.cpp
@ -147,24 +147,14 @@ static bool isGloballyDisabledAddmmCudaLt(const at::Device& device) {
 /*
 * Check whether for the given input we want to enable the Lt interface
 */
-static bool isInputCompliesAddmmCudaLt(
-    Tensor& result,
-    const Tensor& self,
-    const Tensor& mat1,
-    const Tensor& mat2,
-    const Scalar& beta,
-    const Scalar& alpha,
-    Activation activation
-) {
-  #ifdef USE_ROCM
+static bool isInputCompliesAddmmCudaLt(Tensor& result, const Tensor& self, const Tensor& mat1, const Tensor& mat2, const Scalar& beta, const Scalar& alpha) {
  // Implies 2D bias which we currently not send through Lt.
  // TODO: this check is done pre col-major input preparation,
  // so, this condition can be ralexed in cases when a col-major
  // copy of result is needed.
-  if (self.is_same(result) || self.dim() == 2) {
+  if (result.is_same(self)) {
    return false;
  }
-  #endif

  #if defined(USE_ROCM) && ROCM_VERSION == 60400
  // hipblaslt TT fp32 regression on ROCm 6.4, cannot use
@ -179,33 +169,13 @@ static bool isInputCompliesAddmmCudaLt(
  #if defined(CUDA_VERSION) || defined(USE_ROCM)
  const auto scalar_type = mat1.scalar_type();
  return (beta.toComplexDouble() == 1.0
-    // NOTE: row-major result is important when bias is 1D.
-    // This is because Lt broadcasts 1D bias over the columns
-    // while the aten::addmm API broadcasts it over the rows,
-    // and this is in conjuction with the data preparation
-    // procedure that does not transpose arguments with
-    // col-major result. For col-major result we need
-    // to explicitly transpose the problem so that bias is
-    // correctly applied.
-    // TODO: enable col-major result if needed.
-    // TODO: no need to check result's layout when
-    // !result.is_same(self) and self.dim() == 2, because
-    // self needs to be copied into result and the bias ptr
-    // will be ignored.
    && result.dim() == 2 && result.is_contiguous()
+    // Conditions for bias to be fusable
    && (
-      ( // Conditions for bias to be fusable -- implies direct Lt path without copies.
-        self.is_contiguous() &&
-        // NOTE: fine to have 1-len dims to the left from the right-most one
-        (self.dim() == 1 || self.squeeze().dim() == 1) &&
-        self.sizes().back() == mat2_sizes[1]
-      )
-      || ( // 2D bias restrictions. self.is_contiguous() is implicit when result.is_same(self),
-        // and we need to copy self into result otherwise, so the self's layout becomes irrelevant.
-        // See also TODO from above.
-        activation != Activation::None && // Lt is faster when activation is fused
-        (self.dim() == 2 && at::is_expandable_to(self.sizes(), {mat1_sizes[0], mat2_sizes[1]}))
-      )
+      self.is_contiguous() &&
+      // NOTE: fine to have 1-len dims to the left from the right-most one
+      (self.dim() == 1 || self.squeeze().dim() == 1) &&
+      self.sizes().back() == mat2_sizes[1]
    )
    && ( // some dtype restrictions
      #ifndef USE_ROCM
@ -300,16 +270,7 @@ bool launchGemmAndBiasCublasLt(
    const Scalar& alpha,
    Activation activation = Activation::None
 ) {
-  // We apply bias in the epilogue only when it is 1D,
-  // or when it can be squeezed to 1D.
-  // self_ptr == nullptr implies ignore bias epilogue
-  // and use standard gemm-like API.
-  const auto* self_ptr = [&]() -> auto {
-    if (self.dim() == 1 || self.squeeze().dim() == 1) {
-      return self.const_data_ptr<scalar_t>();
-    }
-    return static_cast<const scalar_t*>(nullptr);
-  }();
+  const auto* self_ptr = self.const_data_ptr<scalar_t>();

  const auto tuning_ctx = at::cuda::tunable::getTuningContext();
  if (tuning_ctx->IsTunableOpEnabled()) {
@ -395,7 +356,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
  disable_addmm_cuda_lt = isGloballyDisabledAddmmCudaLt(self.device()) || disable_addmm_cuda_lt;
  #endif
  // Condition on the input
-  disable_addmm_cuda_lt = !isInputCompliesAddmmCudaLt(result, self, mat1, mat2, beta, alpha, activation) || disable_addmm_cuda_lt;
+  disable_addmm_cuda_lt = !isInputCompliesAddmmCudaLt(result, self, mat1, mat2, beta, alpha) || disable_addmm_cuda_lt;
  // }

  at::ScalarType scalar_type = mat1.scalar_type();
@ -405,20 +366,19 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
  if (!result.is_same(self)) {
    at::native::resize_output(result, {mat1.sizes()[0], mat2.sizes()[1]});

-    // We use bias ptr in the Lt path only when bias is 1D
-    const auto use_bias_ptr_lt = (self.dim() == 1) && !disable_addmm_cuda_lt;
    const auto self_maybe_expanded = [&]() -> c10::MaybeOwned<Tensor> {
-      if (!use_bias_ptr_lt) {
-        // We do expand self even before
+      if (disable_addmm_cuda_lt) {
+        // When in non-Lt path we do expand self even before
        // check for beta != 0.0 to make sure that
        // test_sparse_csr.py::TestSparseCSRCUDA::test_addmm_errors_*
        // runs green.
        return expand_size(self, result.sizes(), "addmm");
      }
+      // copy next, should broadcast
      return c10::MaybeOwned<Tensor>::borrowed(self);
    }();
-    // We do not copy bias only when we need the bias ptr
-    if (beta.toComplexDouble() != 0.0 && !use_bias_ptr_lt) {
+    // We copy bias when in the non-Lt path
+    if (beta.toComplexDouble() != 0.0 && disable_addmm_cuda_lt) {
      // NOTE: self should broadcast over result
      at::native::copy_(result, *self_maybe_expanded);
    }
--- a/aten/src/ATen/native/cuda/CUDALoops.cuh
+++ b/aten/src/ATen/native/cuda/CUDALoops.cuh
@ -884,69 +884,6 @@ struct type_specialized_kernel_launcher {
  }
 };

-template <int arg_index>
-struct type_specialized_broadcast_kernel_launcher {
-  template <
-      typename func_t,
-      typename array_t,
-      typename dtypes_t,
-      typename calc_t>
-  static void apply(
-      int64_t numel,
-      func_t f,
-      array_t data,
-      dtypes_t dtypes,
-      calc_t offset_calc) {
-        using traits = function_traits<func_t>;
-        using ret_t = typename traits::result_type;
-        using arg0_t = typename traits::template arg<0>::type;
-        using arg1_t = typename traits::template arg<1>::type;
-        if (dtypes[0] == rt_binary_specializations[arg_index][0] &&
-          dtypes[1] == rt_binary_specializations[arg_index][1] &&
-          dtypes[2] == rt_binary_specializations[arg_index][2]) {
-            using ret_cpp_t = c10::impl::ScalarTypeToCPPTypeT<rt_binary_specializations[arg_index][0]>;
-            using arg0_cpp_t = c10::impl::ScalarTypeToCPPTypeT<rt_binary_specializations[arg_index][1]>;
-            using arg1_cpp_t = c10::impl::ScalarTypeToCPPTypeT<rt_binary_specializations[arg_index][2]>;
-            constexpr int grp_sz = 128;
-            launch_legacy_kernel_manual_unroll<grp_sz, 4>(numel, [=] GPU_LAMBDA(int idx, bool unrl) {
-              if (unrl) {
-                auto offsets0 = offset_calc.get(idx);
-                auto offsets1 = offset_calc.get(idx + grp_sz);
-                auto offsets2 = offset_calc.get(idx + grp_sz * 2);
-                auto offsets3 = offset_calc.get(idx + grp_sz * 3);
-                void* out0 = data[0] + offsets0[0];
-                void* out1 = data[0] + offsets1[0];
-                void* out2 = data[0] + offsets2[0];
-                void* out3 = data[0] + offsets3[0];
-                auto u = c10::load<arg0_cpp_t>(data[1] + offsets0[1]);
-                auto v = c10::load<arg1_cpp_t>(data[2] + offsets0[2]);
-                ret_t result0 = f(c10::convert<arg0_t>(u), c10::convert<arg1_t>(v));
-                auto u1 = c10::load<arg0_cpp_t>(data[1] + offsets1[1]);
-                auto v1 = c10::load<arg1_cpp_t>(data[2]+ offsets1[2]);
-                ret_t result1 = f(c10::convert<arg0_t>(u1), c10::convert<arg1_t>(v1));
-                auto u2 = c10::load<arg0_cpp_t>(data[1] + offsets2[1]);
-                auto v2 = c10::load<arg1_cpp_t>(data[2] + offsets2[2]);
-                ret_t result2 = f(c10::convert<arg0_t>(u2), c10::convert<arg1_t>(v2));
-                auto u3 = c10::load<arg0_cpp_t>(data[1] + offsets3[1]);
-                auto v3 = c10::load<arg1_cpp_t>(data[2] + offsets3[2]);
-                ret_t result3 = f(c10::convert<arg0_t>(u3), c10::convert<arg1_t>(v3));
-                *(ret_cpp_t*)out0 = c10::convert<ret_cpp_t>(result0);
-                *(ret_cpp_t*)out1 = c10::convert<ret_cpp_t>(result1);
-                *(ret_cpp_t*)out2 = c10::convert<ret_cpp_t>(result2);
-                *(ret_cpp_t*)out3 = c10::convert<ret_cpp_t>(result3);
-              } else {
-                auto offsets = offset_calc.get(idx);
-                void* out = data[0] + offsets[0];
-                auto u = c10::load<arg0_cpp_t>(data[1] + offsets[1]);
-                auto v = c10::load<arg1_cpp_t>(data[2] + offsets[2]);
-                ret_t result = f(c10::convert<arg0_t>(u), c10::convert<arg1_t>(v));
-                *(ret_cpp_t*)out = c10::convert<ret_cpp_t>(result);
-              }
-            });
-        }
-      }
-};
-
 } // namespace
 #endif

@ -1065,32 +1002,6 @@ void gpu_kernel_impl(TensorIteratorBase& iter, const func_t& f) {
    }
    auto offset_calc = ::make_offset_calculator<traits::arity + 1>(iter);
 #ifdef USE_ROCM
-    if (check_binary_rt_types_for_specialization(iter)) {
-      // constexpr to reduce the amount of kernels generated for
-      // broadcast elementwise with mexed dtypes and limit which functors are actually
-      // applied to the load and store at compile time.
-      using func_tuple = typename traits::ArgsTuple;
-      if constexpr (
-        std::is_same_v<float, arg0_t> && traits::arity == 2 &&
-        check_binary_functor_types_for_specialization<
-          func_tuple,
-          float,
-          float,
-          traits::arity,
-          /*arg_num=*/0>::check()) {
-            memory::detail::static_unroll<
-              type_specialized_broadcast_kernel_launcher,
-              rt_binary_specializations.size()>::with_args(
-                numel,
-                f,
-                data,
-                dtypes,
-                offset_calc
-            );
-            return;
-      }
-    }
-
    constexpr int grp_sz = 128;
    launch_legacy_kernel_manual_unroll<grp_sz, 4>(numel, [=] GPU_LAMBDA(int idx, bool unrl) {
      if (unrl) {
--- a/aten/src/ATen/native/cuda/GroupedBlas.cpp
+++ b/aten/src/ATen/native/cuda/GroupedBlas.cpp
@ -22,9 +22,6 @@
 #include <ATen/native/cuda/RowwiseScaledMM.h>
 #include <ATen/native/cuda/ScaledGroupMM.h>
 #include <ATen/native/cuda/GroupMM.h>
-#ifdef USE_ROCM
-#include <ATen/native/hip/ck_group_gemm.h>
-#endif
 #include <ATen/ceil_div.h>

 #ifdef USE_FBGEMM_GENAI
@ -669,19 +666,12 @@ std::optional<c10::ScalarType> out_dtype) {
  // _scaled_mm_allowed_device is used here within _grouped_mm_cuda which seems incorrect since scale is not used.
  // the _grouped_mm_fallback should be safe for any ROCm GPU since it's just calling typical mm/bmm
  bool use_fast_path = false;
-  if (at::detail::getCUDAHooks().isGPUArch({"gfx942", "gfx950"})) {
-    use_fast_path = true;
-  }
 #endif
  const auto out_dtype_ = _resolve_grouped_mm_out_dtype(mat_a, mat_b, out_dtype);
  Tensor out = create_grouped_gemm_output_tensor(mat_a, mat_b, offs, out_dtype_);
  if (use_fast_path) {
    // fast path, no d2h sync needed
-#ifndef USE_ROCM
    at::cuda::detail::bf16bf16_grouped_mm(mat_a, mat_b, offs, bias, out);
-#else
-    at::hip::detail::group_gemm_ck(mat_a, mat_b, offs, bias, out);
-#endif
  } else {
    _grouped_mm_fallback(mat_a, mat_b, offs, bias, out_dtype, out);
  }
--- a/aten/src/ATen/native/hip/ck_group_gemm.h
+++ b/aten/src/ATen/native/hip/ck_group_gemm.h
@ -1,19 +0,0 @@
-#pragma once
-
-#include <ATen/Tensor.h>
-#include <c10/core/ScalarType.h>
-#include <optional>
-
-namespace at {
-namespace hip {
-namespace detail {
-void group_gemm_ck(
-    const at::Tensor& mat_a,
-    const at::Tensor& mat_b,
-    const std::optional<at::Tensor>& offs,
-    const std::optional<at::Tensor>& bias,
-    at::Tensor& out);
-
-} // namespace detail
-} // namespace hip
-} // namespace at
--- a/aten/src/ATen/native/hip/ck_group_gemm.hip
+++ b/aten/src/ATen/native/hip/ck_group_gemm.hip
@ -1,462 +0,0 @@
-#undef __HIP_NO_HALF_CONVERSIONS__
-#include <ATen/hip/HIPContext.h>
-#include <ATen/Tensor.h>
-#include <ATen/TensorAccessor.h>
-#include <c10/hip/HIPStream.h>
-#include <iostream>
-#include <vector>
-#include <optional>
-#include <type_traits>
-
-#include <ck/ck.hpp>
-#include <ck/tensor_operation/gpu/device/tensor_layout.hpp>
-#include <ck/tensor_operation/gpu/device/gemm_specialization.hpp>
-#include <ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_splitk_xdl_cshuffle_two_stage.hpp>
-#include <ck/tensor_operation/gpu/element/element_wise_operation.hpp>
-#include <ck/utility/tuple.hpp>
-
-template <ck::index_t... Is>
-using S = ck::Sequence<Is...>;
-
-namespace at {
-namespace hip {
-namespace detail {
-
-namespace CkTypes {
-    using BF16 = ck::bhalf_t;
-    using F16 = ck::half_t;
-    using F32 = float;
-    using PassThrough = ck::tensor_operation::element_wise::PassThrough;
-}
-
-template <typename ALayout, typename BLayout, typename DataType>
-using GroupedGemmKernel = ck::tensor_operation::device::DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage<
-    ALayout, BLayout, ck::Tuple<>, ck::tensor_layout::gemm::RowMajor,
-    DataType, DataType, CkTypes::F32, DataType, ck::Tuple<>, DataType,
-    CkTypes::PassThrough, CkTypes::PassThrough, CkTypes::PassThrough,
-    ck::tensor_operation::device::GemmSpecialization::MNKPadding,
-    1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2,
-    S<1,4,64,1>,  S<0,2,1,3>, S<0,2,1,3>,
-    3, 8, 8, 1,
-    S<1,4,64,1>,  S<0,2,1,3>, S<0,2,1,3>,
-    3, 8, 8, 1,
-    1, 1,
-    S<1,32,1,8>, 4
->;
-
-template <typename ALayout, typename BLayout, typename DataType>
-void launch_grouped_bgemm_ck_impl_dispatch(
-    const at::Tensor& mat_a,
-    const at::Tensor& mat_b,
-    const std::optional<at::Tensor>& offs,
-    at::Tensor& out)
-{
-    using DeviceOp = GroupedGemmKernel<ALayout, BLayout, DataType>;
-    using PassThrough = CkTypes::PassThrough;
-
-    std::vector<ck::tensor_operation::device::GemmDesc> gemm_descs;
-    std::vector<const void*> p_a_ptrs, p_b_ptrs;
-    std::vector<void*> p_e_ptrs;
-    // Note: d_ptrs will be resized after we populate the other vectors
-
-    const int mat_a_dim = mat_a.dim();
-    const int mat_b_dim = mat_b.dim();
-
-    const char* a_ptr_base = reinterpret_cast<const char*>(mat_a.data_ptr());
-    const char* b_ptr_base = reinterpret_cast<const char*>(mat_b.data_ptr());
-    char* out_ptr_base = reinterpret_cast<char*>(out.data_ptr());
-    const size_t a_element_size = mat_a.element_size();
-    const size_t b_element_size = mat_b.element_size();
-    const size_t out_element_size = out.element_size();
-
-    // for each group, calculate m,n,k,lda,ldb,ldc and A,B,out pointer base addresses.
-    if (mat_a_dim == 2 && mat_b_dim == 2) {
-        // 2D*2D case requires offset tensor
-        auto offs_accessor = offs->accessor<int, 1>();
-        int num_groups = offs_accessor.size(0);
-        const int M = mat_a.size(0); // number of rows in A
-        const int N = mat_b.size(1); // number of columns in B
-        const int K = mat_a.size(1); // columns in A == rows in B
-        // for 2d*2d input, output is 3d.
-        // for each group, A columns (K) are sliced. M and N dimensions are not sliced.
-        for (int i = 0; i < num_groups; ++i) {
-            int start_k = (i == 0) ? 0 : offs_accessor[i-1];
-            int end_k = offs_accessor[i];
-            int k = end_k - start_k;
-
-            //K dimension are sliced, hence select stride(1) always.
-            //K dimension is always dimension 1, regardless of memory layout (row/column major)
-            const void* group_a_ptr = a_ptr_base + start_k * mat_a.stride(1) * a_element_size;
-            const void* group_b_ptr;
-            int ldb;
-
-            if (std::is_same<BLayout, ck::tensor_layout::gemm::RowMajor>::value) {
-                // Row-major B [K,N]: K values are horizontally adjacent, use stride(1) for K offset
-                group_b_ptr = b_ptr_base + start_k * mat_b.stride(1) * b_element_size;
-                // Leading dimension = distance between rows = stride(0)
-                ldb = mat_b.stride(0);
-            } else {
-                // Column-major B [K,N]: K values are vertically adjacent, use stride(0) for K offset
-                group_b_ptr = b_ptr_base + start_k * mat_b.stride(0) * b_element_size;
-                // Leading dimension = distance between columns = stride(1)
-                ldb = mat_b.stride(1);
-            }
-
-            // Calculate output pointer for group i in 3D tensor [num_groups, M, N]
-            // stride(0) = M*N elements between groups, so skip i*stride(0) elements to reach group i
-            void* group_e_ptr = out_ptr_base + i * out.stride(0) * out_element_size;
-            int lda, ldc;
-            if (std::is_same<ALayout, ck::tensor_layout::gemm::RowMajor>::value) {
-                // Row-major A [M,K]: leading dimension = distance between rows = stride(0)
-                lda = mat_a.stride(0);
-            } else {
-                // Column-major A [M,K]: leading dimension = distance between columns = stride(1)
-                lda = mat_a.stride(1);
-            }
-            // Output is always row-major in 3D tensor [num_groups, M, N]
-            // Leading dimension for each group's [M,N] slice = stride(1) = N
-            ldc = out.stride(1);
-            size_t output_group_bytes = M * N * out_element_size;
-            void* group_e_ptr_end = (char*)group_e_ptr + output_group_bytes;
-
-            gemm_descs.push_back({
-                static_cast<ck::index_t>(M),
-                static_cast<ck::index_t>(N),
-                static_cast<ck::index_t>(k),
-                static_cast<ck::index_t>(lda),
-                static_cast<ck::index_t>(ldb),
-                static_cast<ck::index_t>(ldc),
-                {} // --> stride_Ds_
-            });
-            p_a_ptrs.push_back(group_a_ptr);
-            p_b_ptrs.push_back(group_b_ptr);
-            p_e_ptrs.push_back(group_e_ptr);
-        }
-    } else if (mat_a_dim == 2 && mat_b_dim == 3) {
-        // 2D*3D case requires offset tensor
-        auto offs_accessor = offs->accessor<int, 1>();
-        int num_groups = offs_accessor.size(0);
-
-        // 2d*3d input, output is 2d.
-        // A: [m * n_groups, k], B: [n_groups, n, k] or [n_groups, k, n], Output: [m * n_groups, n]
-        // Offset divides M dimension (rows of A), each group gets different rows of A and different batch of B
-        const int K = mat_a.size(1); // columns in A
-        // For 2D-3D case: The output determines N (result width)
-        const int N = out.size(1); // N is the width of the output tensor
-
-        for (int i = 0; i < num_groups; ++i) {
-            int start_m = (i == 0) ? 0 : offs_accessor[i - 1];
-            int end_m = offs_accessor[i];
-            int m = end_m - start_m;
-
-            // Skip zero-sized groups but continue processing subsequent groups
-            if (m <= 0) {
-                continue;
-            }
-
-            // Select A rows for group i: skip start_m rows
-            const void* group_a_ptr;
-            int lda;
-            if (std::is_same<ALayout, ck::tensor_layout::gemm::RowMajor>::value) {
-                // Row-major A [total_m, K]: skip start_m rows, each row is stride(0) elements apart
-                group_a_ptr = a_ptr_base + start_m * mat_a.stride(0) * a_element_size;
-                lda = mat_a.stride(0); // distance between rows
-            } else {
-                // Column-major A [total_m, K]: skip start_m elements in the first dimension (stride(0) is between rows)
-                group_a_ptr = a_ptr_base + start_m * mat_a.stride(0) * a_element_size;
-
-                // Detect stride pattern for A tensor to determine appropriate lda calculation
-                bool a_is_strided_tensor = (mat_a.stride(0) > mat_a.size(0));
-
-                if (a_is_strided_tensor) {
-                    // For strided A tensors: stride(0) gives the actual leading dimension
-                    lda = mat_a.stride(0);
-                } else {
-                    // For non-strided A tensors: use the M dimension (total rows)
-                    lda = mat_a.size(0); // Total M dimension for column-major layout
-                }
-            }
-
-            // Select B batch for group i: B[i, :, :]
-            const void* group_b_ptr = b_ptr_base + i * mat_b.stride(0) * b_element_size;
-            int ldb;
-
-            if (std::is_same<BLayout, ck::tensor_layout::gemm::RowMajor>::value) {
-                // Row-major GEMM: expecting B as [K, N] but we have [N, K], so transpose needed
-                ldb = mat_b.stride(2); // Leading dimension for accessing as [K, N]
-            } else {
-                // Detect stride pattern to determine appropriate ldb calculation
-                bool is_strided_tensor = (mat_b.stride(2) > mat_b.size(2));
-
-                if (is_strided_tensor) {
-                    // For strided tensors: stride(2) gives the actual leading dimension
-                    ldb = mat_b.stride(2);
-                } else {
-                    // For non-strided tensors: use the N dimension
-                    ldb = mat_b.size(1);
-                }
-            }
-
-            // Output for this group: rows [start_m:end_m, :] in 2D output [total_m, N]
-            void* group_e_ptr = out_ptr_base + start_m * out.stride(0) * out_element_size;
-            int ldc = out.stride(0); // distance between rows in output (should be N for 2D case)
-
-            gemm_descs.push_back({
-                static_cast<ck::index_t>(m),
-                static_cast<ck::index_t>(N),
-                static_cast<ck::index_t>(K),
-                static_cast<ck::index_t>(lda),
-                static_cast<ck::index_t>(ldb),
-                static_cast<ck::index_t>(ldc),
-                {} // --> stride_Ds_
-            });
-            p_a_ptrs.push_back(group_a_ptr);
-            p_b_ptrs.push_back(group_b_ptr);
-            p_e_ptrs.push_back(group_e_ptr);
-        }
-    } else if (mat_a_dim == 3 && mat_b_dim == 3) {
-        // 3d*3d input, output is 3d - batched matrix multiplication
-        // A: [batch, m, k], B: [batch, k, n] or [batch, n, k] (depending on transpose), Output: [batch, m, n]
-        // Each batch is processed as a separate GEMM operation
-        const int batch_size = mat_a.size(0);
-        const int M = mat_a.size(1); // rows in each A matrix
-        const int K = mat_a.size(2); // columns in A == rows in B (or columns if B is transposed)
-
-        // Determine N from B tensor - it could be B.size(1) or B.size(2) depending on layout
-        int N;
-        if (mat_b.size(1) == K) {
-            // B is [batch, k, n] - normal layout
-            N = mat_b.size(2);
-        } else if (mat_b.size(2) == K) {
-            // B is [batch, n, k] - transposed layout
-            N = mat_b.size(1);
-        } else {
-            TORCH_CHECK(false, "CK Group GEMM 3D-3D: B tensor dimensions incompatible with A. A=[",
-                       batch_size, ",", M, ",", K, "], B=[", mat_b.size(0), ",", mat_b.size(1), ",", mat_b.size(2), "]");
-        }
-
-        for (int i = 0; i < batch_size; ++i) {
-            // Select A batch for group i: A[i, :, :]
-            const void* group_a_ptr = a_ptr_base + i * mat_a.stride(0) * a_element_size;
-
-            // Select B batch for group i: B[i, :, :]
-            const void* group_b_ptr = b_ptr_base + i * mat_b.stride(0) * b_element_size;
-
-            // Select output batch for group i: Output[i, :, :]
-            void* group_e_ptr = out_ptr_base + i * out.stride(0) * out_element_size;
-
-            int lda, ldb, ldc;
-
-            if (std::is_same<ALayout, ck::tensor_layout::gemm::RowMajor>::value) {
-                // Row-major A: leading dimension = distance between rows = stride(1)
-                lda = mat_a.stride(1);
-            } else {
-                // Column-major A: leading dimension = distance between columns = stride(2)
-                lda = mat_a.stride(2);
-            }
-
-            if (std::is_same<BLayout, ck::tensor_layout::gemm::RowMajor>::value) {
-                // Row-major B: leading dimension = distance between rows
-                if (mat_b.size(1) == K) {
-                    // B is [batch, k, n] - normal layout
-                    ldb = mat_b.stride(1); // stride between K rows
-                } else {
-                    // B is [batch, n, k] - transposed layout, treat as [k, n] for GEMM
-                    ldb = mat_b.stride(2); // stride between N rows (since we're accessing as [k,n])
-                }
-            } else {
-                // Column-major B: leading dimension = distance between columns
-                if (mat_b.size(1) == K) {
-                    // B is [batch, k, n] - normal layout
-                    ldb = mat_b.stride(2); // stride between N columns
-                } else {
-                    // B is [batch, n, k] - transposed layout
-                    ldb = mat_b.stride(1); // stride between K columns (since we're accessing as [n,k]→[k,n])
-                }
-            }
-
-            // Output is typically row-major: leading dimension = distance between rows = stride(1)
-            ldc = out.stride(1);
-
-            gemm_descs.push_back({
-                static_cast<ck::index_t>(M),
-                static_cast<ck::index_t>(N),
-                static_cast<ck::index_t>(K),
-                static_cast<ck::index_t>(lda),
-                static_cast<ck::index_t>(ldb),
-                static_cast<ck::index_t>(ldc),
-                {} // --> stride_Ds_
-            });
-            p_a_ptrs.push_back(group_a_ptr);
-            p_b_ptrs.push_back(group_b_ptr);
-            p_e_ptrs.push_back(group_e_ptr);
-        }
-    } else if (mat_a_dim == 3 && mat_b_dim == 2) {
-        // 3D*2D case requires offset tensor
-        auto offs_accessor = offs->accessor<int, 1>();
-        int num_groups = offs_accessor.size(0);
-        // 3d*2d input, output is 3d.
-        // A: [n_groups, m, k], B: [k, total_n] (assuming row-major for both)
-        // Offset divides N dimension of B, each group gets different slice of B and different batch of A
-        const int batch_size = mat_a.size(0); // n_groups
-        const int M = mat_a.size(1); // rows in each A matrix
-        const int K = mat_a.size(2); // columns in A
-
-        // For row-major A and B case: B should be [K, total_N]
-        const int total_N = mat_b.size(1); // B is [K, total_N] for row-major
-
-        for (int i = 0; i < num_groups; ++i) {
-            int start_n = (i == 0) ? 0 : offs_accessor[i - 1];
-            int end_n = offs_accessor[i];
-            int n = end_n - start_n;
-
-            // Skip zero-sized groups but continue processing subsequent groups
-            if (n <= 0) {
-                continue;
-            }
-
-            // Select A batch for group i: A[i, :, :]
-            const void* group_a_ptr = a_ptr_base + i * mat_a.stride(0) * a_element_size;
-
-            // Select B slice for group i: B[:, start_n:end_n] (B[K, total_N])
-            const void* group_b_ptr;
-            int ldb;
-
-            // Check if B is row-major or column-major
-            if (std::is_same<BLayout, ck::tensor_layout::gemm::RowMajor>::value) {
-                // Row-major B [K, total_N]: slice columns [start_n:end_n]
-                group_b_ptr = b_ptr_base + start_n * mat_b.stride(1) * b_element_size;
-                ldb = mat_b.stride(0); // distance between rows (should be total_N)
-            } else {
-                // Column-major B [K, total_N]: slice columns [start_n:end_n]
-                group_b_ptr = b_ptr_base + start_n * mat_b.stride(1) * b_element_size;
-                ldb = mat_b.stride(1); // distance between columns (should be K)
-            }
-
-            // Select output slice for group i: Output[:, start_n:end_n]
-            void* group_e_ptr = out_ptr_base + start_n * out.stride(1) * out_element_size;
-
-            int lda, ldc;
-
-            // Row-major A: leading dimension = distance between rows = stride(1)
-            lda = mat_a.stride(1);
-            // Output is row-major: leading dimension = distance between rows = stride(0)
-            ldc = out.stride(0);
-
-            gemm_descs.push_back({
-                static_cast<ck::index_t>(M),
-                static_cast<ck::index_t>(n),
-                static_cast<ck::index_t>(K),
-                static_cast<ck::index_t>(lda),
-                static_cast<ck::index_t>(ldb),
-                static_cast<ck::index_t>(ldc),
-                {} // --> stride_Ds_
-            });
-            p_a_ptrs.push_back(group_a_ptr);
-            p_b_ptrs.push_back(group_b_ptr);
-            p_e_ptrs.push_back(group_e_ptr);
-        }
-    } else {
-        TORCH_CHECK(false, "CK Group GEMM: Unsupported dimensions, mat A dim is ", mat_a_dim, ", mat B dim is ", mat_b_dim);
-    }
-
-    TORCH_INTERNAL_ASSERT(p_a_ptrs.size() > 0, "CK Group GEMM: No valid groups");
-
-    // Initialize d_ptrs with the correct size
-    std::vector<std::array<const void*, 0>> d_ptrs(p_a_ptrs.size());
-
-    static DeviceOp gemm_instance;
-    auto argument = gemm_instance.MakeArgument(
-        p_a_ptrs, p_b_ptrs, d_ptrs, p_e_ptrs,
-        gemm_descs, PassThrough{}, PassThrough{}, PassThrough{}
-    );
-    TORCH_INTERNAL_ASSERT(gemm_instance.IsSupportedArgument(argument),
-        "CK Group GEMM: argument unsupported (shape/strides/type config)");
-    size_t arg_buf_size = gemm_instance.GetDeviceKernelArgSize(&argument);
-    size_t ws_size = gemm_instance.GetWorkSpaceSize(&argument);
-
-    void* gemm_arg_buf = nullptr;
-    void* ws_buf = nullptr;
-
-    hipMalloc(&gemm_arg_buf, arg_buf_size);
-    hipMalloc(&ws_buf, ws_size);
-
-    gemm_instance.SetDeviceKernelArgs(&argument, gemm_arg_buf);
-    gemm_instance.SetWorkSpacePointer(&argument, ws_buf);
-
-    auto invoker = gemm_instance.MakeInvoker();
-    hipStream_t stream = c10::hip::getCurrentHIPStream();
-    invoker.Run(argument, {stream});
-    hipFree(gemm_arg_buf);
-    hipFree(ws_buf);
-}
-
-void group_gemm_ck(
-    const at::Tensor& input_a,
-    const at::Tensor& input_b_colmajor,
-    const std::optional<at::Tensor>& offs,
-    const std::optional<at::Tensor>& /*bias*/,
-    at::Tensor& out)
-{
-    // Detect if input_a is row-major based on stride pattern
-    bool a_row_major = (input_a.dim() == 3) ? (input_a.stride(2) == 1) : (input_a.stride(1) == 1);
-    bool b_col_major = (input_b_colmajor.dim() == 3) ? (input_b_colmajor.stride(1) == 1) : (input_b_colmajor.stride(0) == 1);
-    // Ensure tensor A is row-major and contiguous if not already
-    at::Tensor mat_a = input_a;
-    if (!a_row_major) {
-        // If A is not row-major, make it contiguous (row-major)
-        mat_a = input_a.contiguous();
-    }
-    // Force tensor B to be column-major using double transpose trick
-    // This guarantees stride(0) == 1 and stride(1) == K for [K, N] shape
-    at::Tensor mat_b = input_b_colmajor;
-    if (!b_col_major) {
-        mat_b = input_b_colmajor.transpose(-2, -1).contiguous().transpose(-2, -1);
-    }
-
-    // For 3D tensors, check the last dimension stride for row-major detection
-    a_row_major = (mat_a.dim() == 3) ? (mat_a.stride(2) == 1) : (mat_a.stride(1) == 1);
-    bool b_row_major = (mat_b.dim() == 3) ? (mat_b.stride(2) == 1) : (mat_b.stride(1) == 1);
-
-    if (mat_a.dtype() == at::kBFloat16) {
-        // bf16 path
-        if (a_row_major && b_row_major) {
-            launch_grouped_bgemm_ck_impl_dispatch<ck::tensor_layout::gemm::RowMajor, ck::tensor_layout::gemm::RowMajor, CkTypes::BF16>(mat_a, mat_b, offs, out);
-        } else if (a_row_major && !b_row_major) {
-            launch_grouped_bgemm_ck_impl_dispatch<ck::tensor_layout::gemm::RowMajor, ck::tensor_layout::gemm::ColumnMajor, CkTypes::BF16>(mat_a, mat_b, offs, out);
-        } else if (!a_row_major && b_row_major) {
-            launch_grouped_bgemm_ck_impl_dispatch<ck::tensor_layout::gemm::ColumnMajor, ck::tensor_layout::gemm::RowMajor, CkTypes::BF16>(mat_a, mat_b, offs, out);
-        } else {
-            launch_grouped_bgemm_ck_impl_dispatch<ck::tensor_layout::gemm::ColumnMajor, ck::tensor_layout::gemm::ColumnMajor, CkTypes::BF16>(mat_a, mat_b, offs, out);
-        }
-    } else if (mat_a.dtype() == at::kHalf) {
-        // fp16 path
-        if (a_row_major && b_row_major) {
-            launch_grouped_bgemm_ck_impl_dispatch<ck::tensor_layout::gemm::RowMajor, ck::tensor_layout::gemm::RowMajor, CkTypes::F16>(mat_a, mat_b, offs, out);
-        } else if (a_row_major && !b_row_major) {
-            launch_grouped_bgemm_ck_impl_dispatch<ck::tensor_layout::gemm::RowMajor, ck::tensor_layout::gemm::ColumnMajor, CkTypes::F16>(mat_a, mat_b, offs, out);
-        } else if (!a_row_major && b_row_major) {
-            launch_grouped_bgemm_ck_impl_dispatch<ck::tensor_layout::gemm::ColumnMajor, ck::tensor_layout::gemm::RowMajor, CkTypes::F16>(mat_a, mat_b, offs, out);
-        } else {
-            launch_grouped_bgemm_ck_impl_dispatch<ck::tensor_layout::gemm::ColumnMajor, ck::tensor_layout::gemm::ColumnMajor, CkTypes::F16>(mat_a, mat_b, offs, out);
-        }
-    } else if (mat_a.dtype() == at::kFloat) {
-        // fp32 path
-        if (a_row_major && b_row_major) {
-            launch_grouped_bgemm_ck_impl_dispatch<ck::tensor_layout::gemm::RowMajor, ck::tensor_layout::gemm::RowMajor, CkTypes::F32>(mat_a, mat_b, offs, out);
-        } else if (a_row_major && !b_row_major) {
-            launch_grouped_bgemm_ck_impl_dispatch<ck::tensor_layout::gemm::RowMajor, ck::tensor_layout::gemm::ColumnMajor, CkTypes::F32>(mat_a, mat_b, offs, out);
-        } else if (!a_row_major && b_row_major) {
-            launch_grouped_bgemm_ck_impl_dispatch<ck::tensor_layout::gemm::ColumnMajor, ck::tensor_layout::gemm::RowMajor, CkTypes::F32>(mat_a, mat_b, offs, out);
-        } else {
-            launch_grouped_bgemm_ck_impl_dispatch<ck::tensor_layout::gemm::ColumnMajor, ck::tensor_layout::gemm::ColumnMajor, CkTypes::F32>(mat_a, mat_b, offs, out);
-        }
-    } else {
-        TORCH_CHECK(false, "CK Group GEMM: Unsupported mat_a dtype");
-    }
-
-}
-
-} // namespace detail
-} // namespace hip
-} // namespace at
--- a/aten/src/ATen/native/mkldnn/xpu/detail/QConv.cpp
+++ b/aten/src/ATen/native/mkldnn/xpu/detail/QConv.cpp
@ -133,7 +133,7 @@ at::Tensor quantized_convolution(
  // supported in conv.
  mask_weight = weight_zero_points.numel() > 1 ? 1 : 0;
  if (groups > 1 && weight_zero_points.numel() > 1)
-    mask_weight = (1 << 0) | (1 << 1); // 2^0 (group) | 2^1 (output channel)
+    mask_weight = (2 ^ 0) | (2 ^ 1); // 2^0 (group) | 2^1 (output channel)
  dnnl::primitive_attr pattr;

  bool src_need_zp = (act_zero_point != 0);
--- a/aten/src/ATen/native/mps/operations/Blas.mm
+++ b/aten/src/ATen/native/mps/operations/Blas.mm
@ -141,9 +141,6 @@ static Tensor& addmv_out_mps_impl(const Tensor& self,
  };

  MPSStream* stream = at::mps::getCurrentMPSStream();
-  if (result.numel() == 0) {
-    return result;
-  }
  Tensor matMulVec = at::mm(mat, vec.unsqueeze(1)).squeeze(1);

  @autoreleasepool {
--- a/aten/src/ATen/native/mps/operations/LossOps.mm
+++ b/aten/src/ATen/native/mps/operations/LossOps.mm
@ -212,12 +212,17 @@ static Tensor& bce_loss_out_impl(const Tensor& input,
  loss.resize_((reduction == Reduction::None || grad_output.defined()) ? target.sizes() : IntArrayRef({}));
  TORCH_CHECK(loss.is_mps());

+  Tensor loss_squeezed = loss.squeeze();
+  Tensor input_squeezed = input.squeeze();
+  Tensor target_squeezed = target.squeeze();
+
  @autoreleasepool {
-    std::string key = op_name + reductionToString(reduction) + getTensorsStringKey({input, target, weight});
+    std::string key =
+        op_name + reductionToString(reduction) + getTensorsStringKey({input_squeezed, target_squeezed, weight});

    auto cachedGraph = LookUpOrCreateCachedGraph<CachedGraph>(key, [&](auto mpsGraph, auto newCachedGraph) {
-      newCachedGraph->inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, input);
-      newCachedGraph->targetTensor = mpsGraphRankedPlaceHolder(mpsGraph, target);
+      newCachedGraph->inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, input_squeezed);
+      newCachedGraph->targetTensor = mpsGraphRankedPlaceHolder(mpsGraph, target_squeezed);

      MPSGraphTensor* bceLossUnweighted = nil;
      // if grad_output is defined, then it's a backward pass
@ -247,12 +252,12 @@ static Tensor& bce_loss_out_impl(const Tensor& input,
          newCachedGraph->gradInputTensor = bceLoss;
        }
      } else {
-        newCachedGraph->lossTensor = reduceTensor(bceLoss, reduction, mpsGraph, input.sizes().size());
+        newCachedGraph->lossTensor = reduceTensor(bceLoss, reduction, mpsGraph, input_squeezed.sizes().size());
      }
    });
-    Placeholder inputPlaceholder = Placeholder(cachedGraph->inputTensor, input);
-    Placeholder targetPlaceholder = Placeholder(cachedGraph->targetTensor, target);
-    Placeholder lossPlaceholder = Placeholder(cachedGraph->lossTensor, loss);
+    Placeholder inputPlaceholder = Placeholder(cachedGraph->inputTensor, input_squeezed);
+    Placeholder targetPlaceholder = Placeholder(cachedGraph->targetTensor, target_squeezed);
+    Placeholder lossPlaceholder = Placeholder(cachedGraph->lossTensor, loss_squeezed);

    NSMutableDictionary* feeds = [[NSMutableDictionary new] autorelease];

--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@ -2803,7 +2803,7 @@
 - func: floor_divide.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
  device_check: NoCheck   # TensorIterator
  dispatch:
-    CPU, CUDA, MPS, MTIA: floor_divide_out
+    CPU, CUDA, MPS: floor_divide_out
    SparseCPU, SparseCUDA, SparseMPS: floor_divide_out_sparse_zerodim

 - func: floor_divide.Scalar(Tensor self, Scalar other) -> Tensor
@ -4292,7 +4292,6 @@
  dispatch:
    SparseCPU: sparse_sparse_matmul_cpu
    SparseCUDA: sparse_sparse_matmul_cuda
-    SparseMPS: sparse_sparse_matmul_mps
  autogen: _sparse_sparse_matmul.out

 - func: mode(Tensor self, int dim=-1, bool keepdim=False) -> (Tensor values, Tensor indices)
@ -4384,7 +4383,7 @@
  variants: function, method
  dispatch:
    CompositeExplicitAutograd: mv
-    SparseCPU, SparseCUDA, SparseMPS: mv_sparse
+    SparseCPU, SparseCUDA: mv_sparse

 - func: mv.out(Tensor self, Tensor vec, *, Tensor(a!) out) -> Tensor(a!)
  dispatch:
@ -9833,7 +9832,7 @@
  structured_delegate: erfinv.out
  variants: method, function
  dispatch:
-    SparseCPU, SparseCUDA, SparseMPS: erfinv_sparse
+    SparseCPU, SparseCUDA: erfinv_sparse
    SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: erfinv_sparse_csr
  tags: pointwise

@ -9842,7 +9841,7 @@
  structured_delegate: erfinv.out
  variants: method
  dispatch:
-    SparseCPU, SparseCUDA, SparseMPS: erfinv_sparse_
+    SparseCPU, SparseCUDA: erfinv_sparse_
    SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: erfinv_sparse_csr_
  tags: pointwise

@ -9852,7 +9851,7 @@
  structured_inherits: TensorIteratorBase
  dispatch:
    CPU, CUDA, MPS: erfinv_out
-    SparseCPU, SparseCUDA, SparseMPS: erfinv_sparse_out
+    SparseCPU, SparseCUDA: erfinv_sparse_out
    SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: erfinv_sparse_csr_out
  tags: pointwise

--- a/aten/src/ATen/native/sparse/mps/SparseMPSTensorMath.mm
+++ b/aten/src/ATen/native/sparse/mps/SparseMPSTensorMath.mm
@ -10,10 +10,6 @@
 #include <ATen/NativeFunctions.h>
 #else
 #include <ATen/ops/_coalesce_native.h>
-#include <ATen/ops/repeat_interleave_native.h>
-#include <ATen/ops/cumsum.h>
-#include <ATen/ops/_sparse_sparse_matmul_native.h>
-#include <ATen/ops/_sparse_coo_tensor_unsafe.h>
 #include <ATen/ops/_sparse_coo_tensor_unsafe_native.h>
 #include <ATen/ops/cat.h>
 #include <ATen/ops/add_native.h>
@ -892,114 +888,5 @@ static void sparse_mask_intersection_out_mps_kernel(
      /*coalesce_mask=*/false);
 }

-Tensor sparse_sparse_matmul_mps(const Tensor& mat1_, const Tensor& mat2_) {
-  TORCH_CHECK(mat1_.is_sparse() && mat2_.is_sparse(),
-              "sparse_sparse_matmul_mps: both inputs must be sparse COO tensors");
-  TORCH_CHECK(mat1_.is_mps() && mat2_.is_mps(),
-              "sparse_sparse_matmul_mps: both inputs must be on MPS device");
-  TORCH_CHECK(mat1_.dim() == 2 && mat2_.dim() == 2,
-              "sparse_sparse_matmul_mps: both inputs must be 2D matrices");
-  TORCH_CHECK(mat1_.dense_dim() == 0 && mat2_.dense_dim() == 0,
-              "sparse_sparse_matmul_mps: only scalar values supported (dense_dim == 0)");
-  TORCH_CHECK(mat1_.size(1) == mat2_.size(0),
-              "mat1 and mat2 shapes cannot be multiplied (", mat1_.size(0), "x", mat1_.size(1), " and ", mat2_.size(0), "x", mat2_.size(1), ")");
-  TORCH_CHECK(mat1_.scalar_type() == mat2_.scalar_type(),
-              "sparse_sparse_matmul_mps: mat1 dtype ", mat1_.scalar_type(),
-              " does not match mat2 dtype ", mat2_.scalar_type());
-
-  const auto device = mat1_.device();
-
-  auto A = mat1_.coalesce();
-  auto B = mat2_.coalesce();
-
-  const auto I = A.size(0);
-  const auto K = A.size(1);
-  const auto N = B.size(1);
-
-  const auto nnzA = A._nnz();
-  const auto nnzB = B._nnz();
-
-  // Early empty result, return an empty, coalesced tensor
-  if (I == 0 || N == 0 || K == 0 || nnzA == 0 || nnzB == 0) {
-    auto empty_idx = at::empty({2, 0}, at::device(device).dtype(at::kLong));
-    auto empty_val = at::empty({0}, at::device(device).dtype(mat1_.scalar_type()));
-    auto out = _sparse_coo_tensor_unsafe(empty_idx, empty_val, {I, N}, mat1_.options());
-    out._coalesced_(true);
-    return out;
-  }
-
-  const auto computeDtype = at::result_type(mat1_, mat2_);
-
-  auto A_idx = A._indices().contiguous();
-  auto A_val = A._values().to(computeDtype).contiguous();
-  auto A_i = A_idx.select(0, 0).contiguous();
-  auto A_k = A_idx.select(0, 1).contiguous();
-
-  auto B_idx = B._indices().contiguous();
-  auto B_val = B._values().to(computeDtype).contiguous();
-  auto B_k = B_idx.select(0, 0).contiguous();
-  auto B_j = B_idx.select(0, 1).contiguous();
-
-  // csr-style row pointers for B by k (the shared dimension)
-  Tensor row_ptr_B;
-  {
-    auto batch_ptr = at::tensor({0LL, nnzB}, at::device(device).dtype(at::kLong));
-    row_ptr_B = at::empty({K + 1}, at::device(device).dtype(at::kLong));
-    build_row_ptr_per_batch_mps(B_k, batch_ptr, /*B=*/1, /*I=*/K, row_ptr_B);
-  }
-
-  auto row_ptr_B_lo = row_ptr_B.narrow(0, 0, K);
-  auto row_ptr_B_hi = row_ptr_B.narrow(0, 1, K);
-  auto deg_B = row_ptr_B_hi.sub(row_ptr_B_lo);
-
-  auto counts = deg_B.index_select(0, A_k);
-
-  const int64_t P = counts.sum().item<int64_t>();
-  if (P == 0) {
-    auto empty_idx = at::empty({2, 0}, at::device(device).dtype(at::kLong));
-    auto empty_val = at::empty({0}, at::device(device).dtype(mat1_.scalar_type()));
-    auto out = _sparse_coo_tensor_unsafe(empty_idx, empty_val, {I, N}, mat1_.options());
-    out._coalesced_(true);
-    return out;
-  }
-
-  auto group_ids = repeat_interleave_mps(counts);
-
-  // exclusive cumsum of counts
-  auto offsets = cumsum(counts, /*dim=*/0).sub(counts);
-  auto offsets_gather = offsets.index_select(0, group_ids);
-  auto within = at::arange(P, at::device(device).dtype(at::kLong)).sub(offsets_gather);
-
-  // Map each output element to its source B row and position
-  auto k_per_out = A_k.index_select(0, group_ids);
-  auto start_in_B = row_ptr_B.index_select(0, k_per_out);
-  auto seg_index = start_in_B.add(within);
-
-  // Assemble candidate coo pairs and values
-  auto i_out = A_i.index_select(0, group_ids).contiguous();
-  auto j_out = B_j.index_select(0, seg_index).contiguous();
-  auto vA_out = A_val.index_select(0, group_ids).contiguous();
-  auto vB_out = B_val.index_select(0, seg_index).contiguous();
-  auto v_out = vA_out.mul(vB_out);
-
-  // build (2, P) indices
-  auto out_indices = at::empty({2, P}, at::device(device).dtype(at::kLong)).contiguous();
-  out_indices.select(0, 0).copy_(i_out);
-  out_indices.select(0, 1).copy_(j_out);
-
-  auto result = _sparse_coo_tensor_unsafe(
-      out_indices, v_out, {I, N}, mat1_.options().dtype(computeDtype));
-
-  result = result.coalesce();
-
-  if (result.scalar_type() != mat1_.scalar_type()) {
-    auto cast_vals = result._values().to(mat1_.scalar_type());
-    auto out = _sparse_coo_tensor_unsafe(result._indices(), cast_vals, {I, N}, mat1_.options());
-    out._coalesced_(true);
-    return out;
-  }
-  return result;
-}
-
 REGISTER_MPS_DISPATCH(sparse_mask_intersection_out_stub, &sparse_mask_intersection_out_mps_kernel);
 } // namespace at::native
--- a/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp
+++ b/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp
@ -478,7 +478,7 @@ bool check_cudnn_tensor_shapes(sdp_params const& params, bool debug) {
  const auto s_k = params.key.sym_size(2);
  const auto d_qk = params.query.sym_size(3);
  const auto d_v = params.value.sym_size(3);
-  long cudnn_version = at::detail::getCUDAHooks().versionRuntimeCuDNN();
+  long cudnn_version = at::detail::getCUDAHooks().versionCuDNN();
  if (cudnn_version < 8903) {
    if (debug) {
      TORCH_WARN("SDPA fprop requires cudnn 8.9.3 or higher");
@ -709,7 +709,7 @@ bool can_use_cudnn_attention(const sdp_params& params, bool debug) {
  return false;
 #endif
 #if defined(CUDNN_VERSION)
-  static auto cudnn_version = at::detail::getCUDAHooks().versionRuntimeCuDNN();
+  static auto cudnn_version = cudnnGetVersion();
  if (params.dropout > 0.0 && cudnn_version > 91100 && cudnn_version < 91400) {
    if (debug) {
      TORCH_WARN(CUDNN_VERSION, " cuDNN version does not support droppout in SDPA (9.11 - 9.13).");
--- a/benchmarks/operator_benchmark/pt/addmm_test.py
+++ b/benchmarks/operator_benchmark/pt/addmm_test.py
@ -53,8 +53,10 @@ class AddmmBenchmark(op_bench.TorchBenchmarkBase):
        return torch.addmm(input_one, mat1, mat2)


-op_bench.generate_pt_test(addmm_short_configs + addmm_long_configs, AddmmBenchmark)
-op_bench.generate_pt_gradient_test(addmm_long_configs, AddmmBenchmark)
+op_bench.generate_pt_test(addmm_long_configs + addmm_long_configs, AddmmBenchmark)
+op_bench.generate_pt_gradient_test(
+    addmm_long_configs + addmm_long_configs, AddmmBenchmark
+)

 """Mircobenchmark for addbmm operator."""

@ -105,7 +107,9 @@ addbmm_short_configs = op_bench.cross_product_configs(
 )

 op_bench.generate_pt_test(addbmm_long_configs + addbmm_short_configs, AddbmmBenchmark)
-op_bench.generate_pt_gradient_test(addbmm_long_configs, AddbmmBenchmark)
+op_bench.generate_pt_gradient_test(
+    addbmm_long_configs + addbmm_short_configs, AddbmmBenchmark
+)

 if __name__ == "__main__":
    op_bench.benchmark_runner.main()
--- a/benchmarks/sparse/spmm.py
+++ b/benchmarks/sparse/spmm.py
@ -52,18 +52,19 @@ def test_sparse_coo_and_csr(m, n, k, nnz, test_count):
        start.record()
        coo.matmul(mat)
        stop.record()
+
        times.append(start.elapsed_time(stop))

-    coo_mean_time = sum(times) / len(times)
+        coo_mean_time = sum(times) / len(times)

-    times = []
-    for _ in range(test_count):
-        start.record()
-        csr.matmul(mat)
-        stop.record()
-        times.append(start.elapsed_time(stop))
+        times = []
+        for _ in range(test_count):
+            start.record()
+            csr.matmul(mat)
+            stop.record()
+            times.append(start.elapsed_time(stop))

-    csr_mean_time = sum(times) / len(times)
+            csr_mean_time = sum(times) / len(times)

    return coo_mean_time, csr_mean_time

--- a/c10/core/AutogradState.h
+++ b/c10/core/AutogradState.h
@ -1,8 +1,6 @@
 #pragma once

-#include <c10/core/SafePyObject.h>
 #include <c10/macros/Export.h>
-#include <optional>

 namespace c10 {

@ -17,8 +15,7 @@ struct C10_API AutogradState {
      bool inference_mode,
      bool fw_grad_mode,
      bool multithreading_enabled)
-      : graph_exec_group_(std::nullopt),
-        grad_mode_(grad_mode),
+      : grad_mode_(grad_mode),
        inference_mode_(inference_mode),
        fw_grad_mode_(fw_grad_mode),
        multithreading_enabled_(multithreading_enabled),
@ -44,10 +41,6 @@ struct C10_API AutogradState {
    view_replay_enabled_ = view_replay_enabled;
  }

-  void set_graph_exec_group(std::optional<SafePyObject> group) {
-    graph_exec_group_ = std::move(group);
-  }
-
  bool get_grad_mode() const {
    return grad_mode_;
  }
@ -68,12 +61,7 @@ struct C10_API AutogradState {
    return view_replay_enabled_;
  }

-  const std::optional<SafePyObject>& get_graph_exec_group() const {
-    return graph_exec_group_;
-  }
-
 private:
-  std::optional<SafePyObject> graph_exec_group_;
  bool grad_mode_ : 1;
  bool inference_mode_ : 1;
  bool fw_grad_mode_ : 1;
--- a/c10/core/CachingDeviceAllocator.h
+++ b/c10/core/CachingDeviceAllocator.h
@ -96,10 +96,6 @@ struct C10_API DeviceAllocator : public c10::Allocator {

  // Resets peak memory usage statistics for the specified device
  virtual void resetPeakStats(c10::DeviceIndex device) = 0;
-
-  // Return the free memory size and total memory size in bytes for the
-  // specified device.
-  virtual std::pair<size_t, size_t> getMemoryInfo(c10::DeviceIndex device) = 0;
 };

 // This function is used to get the DeviceAllocator for a specific device type
--- a/c10/core/SymBool.cpp
+++ b/c10/core/SymBool.cpp
@ -1,5 +1,4 @@
 #include <c10/core/SymBool.h>
-#include <c10/core/SymInt.h>
 #include <c10/core/SymNodeImpl.h>

 namespace c10 {
@ -112,17 +111,4 @@ bool SymBool::has_hint() const {
  return toSymNodeImpl()->has_hint();
 }

-SymInt SymBool::toSymInt() const {
-  // If concrete bool, return concrete SymInt
-  if (auto ma = maybe_as_bool()) {
-    return SymInt(*ma ? 1 : 0);
-  }
-
-  // Symbolic case: use sym_ite to convert bool to int (0 or 1)
-  auto node = toSymNodeImpl();
-  auto one_node = node->wrap_int(1);
-  auto zero_node = node->wrap_int(0);
-  return SymInt(node->sym_ite(one_node, zero_node));
-}
-
 } // namespace c10
--- a/c10/core/SymBool.h
+++ b/c10/core/SymBool.h
@ -12,8 +12,6 @@

 namespace c10 {

-class SymInt;
-
 class C10_API SymBool {
 public:
  /*implicit*/ SymBool(bool b) : data_(b) {}
@ -82,10 +80,6 @@ class C10_API SymBool {
    return toSymNodeImplUnowned()->constant_bool();
  }

-  // Convert SymBool to SymInt (0 or 1)
-  // This is the C++ equivalent of Python's cast_symbool_to_symint_guardless
-  SymInt toSymInt() const;
-
  bool is_heap_allocated() const {
    return ptr_;
  }
--- a/c10/cuda/CUDAAllocatorConfig.cpp
+++ b/c10/cuda/CUDAAllocatorConfig.cpp
@ -106,9 +106,6 @@ void CUDAAllocatorConfig::parseArgs(const std::string& env) {
    } else if (key == "graph_capture_record_stream_reuse") {
      i = parseGraphCaptureRecordStreamReuse(tokenizer, i);
      used_native_specific_option = true;
-    } else if (key == "per_process_memory_fraction") {
-      i = parsePerProcessMemoryFraction(tokenizer, i);
-      used_native_specific_option = true;
    } else {
      const auto& keys =
          c10::CachingAllocator::AcceleratorAllocatorConfig::getKeys();
@ -149,18 +146,6 @@ size_t CUDAAllocatorConfig::parseGraphCaptureRecordStreamReuse(
  return i;
 }

-double CUDAAllocatorConfig::parsePerProcessMemoryFraction(
-    const c10::CachingAllocator::ConfigTokenizer& tokenizer,
-    size_t i) {
-  tokenizer.checkToken(++i, ":");
-  double val_env = tokenizer.toDouble(++i);
-  TORCH_CHECK_VALUE(
-      val_env >= 0.0 && val_env <= 1.0,
-      "per_process_memory_fraction is invalid, set it in [0.0, 1.0]");
-  m_per_process_memory_fraction = val_env;
-  return i;
-}
-
 size_t CUDAAllocatorConfig::parsePinnedNumRegisterThreads(
    const c10::CachingAllocator::ConfigTokenizer& tokenizer,
    size_t i) {
--- a/c10/cuda/CUDAAllocatorConfig.h
+++ b/c10/cuda/CUDAAllocatorConfig.h
@ -61,10 +61,6 @@ class C10_CUDA_API CUDAAllocatorConfig {
    return instance().m_graph_capture_record_stream_reuse;
  }

-  static double per_process_memory_fraction() {
-    return instance().m_per_process_memory_fraction;
-  }
-
  /** Pinned memory allocator settings */
  static bool pinned_use_cuda_host_register() {
    return instance().m_pinned_use_cuda_host_register;
@ -156,8 +152,7 @@ class C10_CUDA_API CUDAAllocatorConfig {
        "pinned_use_hip_host_register",
        "graph_capture_record_stream_reuse",
        "pinned_reserve_segment_size_mb",
-        "pinned_num_register_threads",
-        "per_process_memory_fraction"};
+        "pinned_num_register_threads"};
    return keys;
  }

@ -182,9 +177,6 @@ class C10_CUDA_API CUDAAllocatorConfig {
  size_t parseGraphCaptureRecordStreamReuse(
      const c10::CachingAllocator::ConfigTokenizer& tokenizer,
      size_t i);
-  double parsePerProcessMemoryFraction(
-      const c10::CachingAllocator::ConfigTokenizer& tokenizer,
-      size_t i);

  std::atomic<size_t> m_pinned_num_register_threads{1};
  std::atomic<size_t> m_pinned_reserve_segment_size_mb{0};
@ -197,7 +189,6 @@ class C10_CUDA_API CUDAAllocatorConfig {
  std::atomic<bool> m_release_lock_on_cudamalloc{false};
  std::atomic<bool> m_pinned_use_cuda_host_register{false};
  std::atomic<bool> m_graph_capture_record_stream_reuse{false};
-  std::atomic<double> m_per_process_memory_fraction{1.0};
 };

 // Keep this for backwards compatibility
--- a/c10/cuda/CUDACachingAllocator.cpp
+++ b/c10/cuda/CUDACachingAllocator.cpp
@ -1100,7 +1100,7 @@ class RingBuffer {
 } // anonymous namespace
 } // namespace Native

-static std::string reportProcessMemoryInfo(const cudaDeviceProp& prop) {
+static std::string reportProcessMemoryInfo(c10::DeviceIndex device) {
 #ifdef PYTORCH_C10_DRIVER_API_SUPPORTED
  void* nvml_handle = DriverAPI::get_nvml_handle();
  if (!nvml_handle) {
@ -1111,6 +1111,9 @@ static std::string reportProcessMemoryInfo(const cudaDeviceProp& prop) {
    return true;
  }();

+  cudaDeviceProp prop{};
+  C10_CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
+
  // NOLINTNEXTLINE(*-c-arrays)
  char pci_id[80];
  snprintf(
@ -1212,16 +1215,14 @@ class DeviceCachingAllocator {
  // record used memory.
  size_t total_allocated_memory = 0;

-  cudaDeviceProp device_prop;
-
-  // maximum amount of memory that device is allowed to
-  // allocate. This is set iff memory fraction is less than 1
-  std::optional<size_t> allowed_memory_maximum{std::nullopt};
+  size_t allowed_memory_maximum = 0;

  // all live expandable segments
  std::vector<ExpandableSegment*> expandable_segments_;
  std::vector<c10::DeviceIndex> devices_with_peer_access_;

+  bool set_fraction = false;
+
  bool record_history = false;

  std::atomic<CreateContextFn> context_recorder_;
@ -1263,9 +1264,6 @@ class DeviceCachingAllocator {
      : device_id(id),
        large_blocks(/*small=*/false),
        small_blocks(/*small=*/true) {
-    C10_CUDA_CHECK(cudaGetDeviceProperties(&device_prop, id));
-
-    setMemoryFraction(CUDAAllocatorConfig::per_process_memory_fraction());
    stats.max_split_size =
        static_cast<int64_t>(AcceleratorAllocatorConfig::max_split_size());
    context_recorder_.store(nullptr);
@ -1401,7 +1399,7 @@ class DeviceCachingAllocator {
    if (!block_found) {
      // Do garbage collection if the flag is set.
      if (C10_UNLIKELY(
-              allowed_memory_maximum.has_value() &&
+              set_fraction &&
              AcceleratorAllocatorConfig::garbage_collection_threshold() >
                  0.0)) {
        garbage_collect_cached_blocks(context);
@ -1458,12 +1456,11 @@ class DeviceCachingAllocator {
      C10_CUDA_CHECK(cudaMemGetInfo(&device_free, &device_total));
      std::string allowed_info;

-      if (allowed_memory_maximum.has_value()) {
-        allowed_info =
-            format_size(allowed_memory_maximum.value()) + " allowed; ";
+      if (set_fraction) {
+        allowed_info = format_size(allowed_memory_maximum) + " allowed; ";
      }

-      std::string proc_info = reportProcessMemoryInfo(device_prop);
+      std::string proc_info = reportProcessMemoryInfo(device_id);

      record_trace(
          TraceEntry::OOM,
@ -1521,7 +1518,7 @@ class DeviceCachingAllocator {
      for (const auto& obs : observers_local) {
        obs(device_id,
            alloc_size,
-            allowed_memory_maximum.value_or(device_total),
+            set_fraction ? allowed_memory_maximum : device_total,
            device_free);
      }

@ -2018,26 +2015,25 @@ class DeviceCachingAllocator {

  /** get memory fraction limiting maximum allocated memory **/
  double getMemoryFraction() {
-    if (!allowed_memory_maximum.has_value()) {
+    if (!set_fraction) {
      return 1.0;
    }

-    return static_cast<double>(allowed_memory_maximum.value()) /
-        static_cast<double>(device_prop.totalGlobalMem);
+    size_t device_free = 0;
+    size_t device_total = 0;
+    C10_CUDA_CHECK(cudaMemGetInfo(&device_free, &device_total));
+    return static_cast<double>(allowed_memory_maximum) /
+        static_cast<double>(device_total);
  }

  /** set memory fraction to limit maximum allocated memory **/
  void setMemoryFraction(double fraction) {
-    TORCH_CHECK(
-        0 <= fraction && fraction <= 1,
-        "invalid fraction:",
-        fraction,
-        ". Please set within [0, 1].");
-    allowed_memory_maximum = std::nullopt;
-    if (fraction < 1.0) {
-      allowed_memory_maximum = static_cast<size_t>(
-          fraction * static_cast<double>(device_prop.totalGlobalMem));
-    }
+    size_t device_free = 0;
+    size_t device_total = 0;
+    C10_CUDA_CHECK(cudaMemGetInfo(&device_free, &device_total));
+    allowed_memory_maximum =
+        static_cast<size_t>(fraction * static_cast<double>(device_total));
+    set_fraction = true;
  }

  /** get expandable segment size for all the streams on device **/
@ -3014,7 +3010,7 @@ class DeviceCachingAllocator {
    BlockPool& pool = *p.pool;

    if (C10_UNLIKELY(
-            allowed_memory_maximum.has_value() &&
+            set_fraction &&
            AcceleratorAllocatorConfig::garbage_collection_threshold() > 0.0)) {
      // Track block reuse interval only when garbage collection is enabled.
      ++pool.get_free_blocks_call_count;
@ -3087,7 +3083,7 @@ class DeviceCachingAllocator {

    size_t gc_threshold = static_cast<size_t>(
        AcceleratorAllocatorConfig::garbage_collection_threshold() *
-        static_cast<double>(allowed_memory_maximum.value()));
+        static_cast<double>(allowed_memory_maximum));
    // No need to trigger GC yet
    if (total_allocated_memory <= gc_threshold) {
      return;
@ -3165,8 +3161,8 @@ class DeviceCachingAllocator {

    bool active_pool =
        p.pool->owner_PrivatePool && p.pool->owner_PrivatePool->allocator();
-    if (allowed_memory_maximum.has_value() &&
-        total_allocated_memory + size > allowed_memory_maximum.value()) {
+    if (set_fraction &&
+        total_allocated_memory + size > allowed_memory_maximum) {
      p.err = cudaErrorMemoryAllocation;
      return false;
      // Temporarily disable checkpointing & cudagraphs internally
@ -3863,6 +3859,7 @@ class NativeCachingAllocator : public CUDAAllocator {
        "Allocator not initialized for device ",
        device,
        ": did you call init?");
+    C10_CUDA_CHECK(c10::cuda::SetDevice(device));
    return device_allocator[device]->getMemoryFraction();
  }

@ -3872,6 +3869,12 @@ class NativeCachingAllocator : public CUDAAllocator {
        "Allocator not initialized for device ",
        device,
        ": did you call init?");
+    TORCH_CHECK(
+        0 <= fraction && fraction <= 1,
+        "invalid fraction:",
+        fraction,
+        ". Please set within [0, 1].");
+    C10_CUDA_CHECK(c10::cuda::SetDevice(device));
    device_allocator[device]->setMemoryFraction(fraction);
  }

--- a/c10/cuda/CUDACachingAllocator.h
+++ b/c10/cuda/CUDACachingAllocator.h
@ -2,7 +2,6 @@

 #include <c10/core/AllocatorConfig.h>
 #include <c10/core/CachingDeviceAllocator.h>
-#include <c10/cuda/CUDAAllocatorConfig.h>
 #include <c10/cuda/CUDAGraphsC10Utils.h>
 #include <c10/cuda/CUDAMacros.h>
 #include <c10/cuda/CUDAStream.h>
@ -345,13 +344,6 @@ class CUDAAllocator : public DeviceAllocator {
      c10::DeviceIndex device,
      std::shared_ptr<AllocatorState> pps) = 0;
  virtual std::string name() = 0;
-  std::pair<size_t, size_t> getMemoryInfo(c10::DeviceIndex device) override {
-    c10::DeviceGuard device_guard({at::kCUDA, device});
-    size_t free = 0;
-    size_t total = 0;
-    C10_CUDA_CHECK(cudaMemGetInfo(&free, &total));
-    return {free, total};
-  }
 };

 // Allocator object, statically initialized
--- a/c10/cuda/CUDAMallocAsyncAllocator.cpp
+++ b/c10/cuda/CUDAMallocAsyncAllocator.cpp
@ -427,6 +427,7 @@ struct CudaMallocAsyncAllocator : public CUDAAllocator {
  // on the current device each later call sees.
  void init(int dev_count) override {
    static bool called = [](int dev_count) {
+      ;
      // Are there external guarantees init will be called before
      // any of the allocator's other functions?
      // std::lock_guard<std::mutex> lk(general_mutex);
--- a/c10/test/build.bzl
+++ b/c10/test/build.bzl
@ -66,15 +66,6 @@ def define_targets(rules):
        ],
    )

-    rules.cc_test(
-        name = "util/nofatal_test",
-        srcs = ["util/nofatal_test.cpp"],
-        deps = [
-            "//c10/util:base",
-            "@com_google_googletest//:gtest_main",
-        ],
-    )
-
    rules.cc_test(
        name = "util/ssize_test",
        srcs = ["util/ssize_test.cpp"],
--- a/c10/test/util/nofatal_test.cpp
+++ b/c10/test/util/nofatal_test.cpp
@ -1,53 +0,0 @@
-#include <gtest/gtest.h>
-
-#include <c10/util/Exception.h>
-#include <c10/util/Logging.h>
-
-namespace {
-template <typename T>
-inline void expectThrowsEq(T&& fn, const char* expected_msg) {
-  try {
-    std::forward<T>(fn)();
-  } catch (const c10::Error& e) {
-    EXPECT_TRUE(
-        std::string(e.what_without_backtrace()).find(expected_msg) !=
-        std::string::npos);
-    return;
-  }
-  ADD_FAILURE() << "Expected to throw exception with message \"" << expected_msg
-                << "\" but didn't throw";
-}
-} // namespace
-
-TEST(NofatalTest, TorchCheckComparisons) {
-  // quick make sure that no-op works as expected
-  TORCH_CHECK_EQ(1, 1) << "i am a silly message " << 1;
-  expectThrowsEq(
-      []() { TORCH_CHECK_EQ(1, 2) << "i am a silly message " << 1; },
-      "Check failed: 1 == 2 (1 vs. 2). i am a silly message 1");
-  expectThrowsEq(
-      []() { TORCH_CHECK_NE(2, 2); }, "Check failed: 2 != 2 (2 vs. 2).");
-  expectThrowsEq(
-      []() { TORCH_CHECK_LT(2, 2); }, "Check failed: 2 < 2 (2 vs. 2).");
-  expectThrowsEq(
-      []() { TORCH_CHECK_LE(3, 2); }, "Check failed: 3 <= 2 (3 vs. 2).");
-  expectThrowsEq(
-      []() { TORCH_CHECK_GT(2, 2); }, "Check failed: 2 > 2 (2 vs. 2).");
-  expectThrowsEq(
-      []() { TORCH_CHECK_GE(2, 3); }, "Check failed: 2 >= 3 (2 vs. 3).");
-  expectThrowsEq(
-      []() {
-        void* p = nullptr;
-        TORCH_CHECK_NOTNULL(p);
-      },
-      "Check failed: 'p' must be non NULL.");
-
-#if GTEST_HAS_DEATH_TEST
-#ifndef NDEBUG
-  // if dbg build, DCHECK should result in deth
-  EXPECT_DEATH(TORCH_DCHECK_EQ(1, 2), "Check failed");
-#else
-  TORCH_DCHECK_EQ(1, 2); // no-op
-#endif
-#endif // GTEST_HAS_DEATH_TEST
-}
--- a/c10/util/Exception.h
+++ b/c10/util/Exception.h
@ -702,98 +702,6 @@ namespace c10::detail {
 #define TORCH_CHECK_ARG(cond, argN, ...) \
  TORCH_CHECK(cond, "invalid argument ", argN, ": ", __VA_ARGS__)

-#ifndef FATAL_IF
-#ifdef C10_USE_GLOG
-#define FATAL_IF(condition)                                              \
-  condition ? (void)0                                                    \
-            : ::c10::LoggerVoidify() &                                   \
-          ::c10::MessageLogger(__FILE__, __LINE__, ::google::GLOG_FATAL) \
-              .stream()
-#else
-#define FATAL_IF(condition)            \
-  condition ? (void)0                  \
-            : ::c10::LoggerVoidify() & \
-          ::c10::MessageLogger(__FILE__, __LINE__, ::c10::GLOG_FATAL).stream()
-#endif
-#endif
-
-#ifndef NON_FATAL_IF
-#ifdef C10_USE_GLOG
-#define NON_FATAL_IF(condition)                                \
-  condition ? (void)0                                          \
-            : ::c10::LoggerVoidify() &                         \
-          ::c10::MessageLogger(                                \
-              __FILE__, __LINE__, ::google::GLOG_FATAL, false) \
-              .stream()
-#else
-#define NON_FATAL_IF(condition)                                              \
-  condition ? (void)0                                                        \
-            : ::c10::LoggerVoidify() &                                       \
-          ::c10::MessageLogger(__FILE__, __LINE__, ::c10::GLOG_FATAL, false) \
-              .stream()
-#endif
-#endif
-
-// Binary comparison check macros
-#define TORCH_CHECK_OP(val1, val2, op)                                      \
-  NON_FATAL_IF(((val1)op(val2)))                                            \
-      << "Check failed: " #val1 " " #op " " #val2 " (" << (val1) << " vs. " \
-      << (val2) << "). "
-
-#define TORCH_DCHECK_OP(val1, val2, op)                                       \
-  FATAL_IF(((val1)op(val2))) << "Check failed: " #val1 " " #op " " #val2 " (" \
-                             << (val1) << " vs. " << (val2) << "). "
-
-#define TORCH_CHECK_EQ(val1, val2) TORCH_CHECK_OP(val1, val2, ==)
-#define TORCH_CHECK_NE(val1, val2) TORCH_CHECK_OP(val1, val2, !=)
-#define TORCH_CHECK_LE(val1, val2) TORCH_CHECK_OP(val1, val2, <=)
-#define TORCH_CHECK_LT(val1, val2) TORCH_CHECK_OP(val1, val2, <)
-#define TORCH_CHECK_GE(val1, val2) TORCH_CHECK_OP(val1, val2, >=)
-#define TORCH_CHECK_GT(val1, val2) TORCH_CHECK_OP(val1, val2, >)
-
-// Debug versions of TORCH_CHECK_OP macros
-#ifndef NDEBUG
-#define TORCH_DCHECK_EQ(val1, val2) TORCH_DCHECK_OP(val1, val2, ==)
-#define TORCH_DCHECK_NE(val1, val2) TORCH_DCHECK_OP(val1, val2, !=)
-#define TORCH_DCHECK_LE(val1, val2) TORCH_DCHECK_OP(val1, val2, <=)
-#define TORCH_DCHECK_LT(val1, val2) TORCH_DCHECK_OP(val1, val2, <)
-#define TORCH_DCHECK_GE(val1, val2) TORCH_DCHECK_OP(val1, val2, >=)
-#define TORCH_DCHECK_GT(val1, val2) TORCH_DCHECK_OP(val1, val2, >)
-#else // !NDEBUG
-// Optimized versions - generate no code
-#define TORCH_DCHECK_EQ(val1, val2) \
-  while (false)                     \
-  TORCH_DCHECK_OP(val1, val2, ==)
-#define TORCH_DCHECK_NE(val1, val2) \
-  while (false)                     \
-  TORCH_DCHECK_OP(val1, val2, !=)
-#define TORCH_DCHECK_LE(val1, val2) \
-  while (false)                     \
-  TORCH_DCHECK_OP(val1, val2, <=)
-#define TORCH_DCHECK_LT(val1, val2) \
-  while (false)                     \
-  TORCH_DCHECK_OP(val1, val2, <)
-#define TORCH_DCHECK_GE(val1, val2) \
-  while (false)                     \
-  TORCH_DCHECK_OP(val1, val2, >=)
-#define TORCH_DCHECK_GT(val1, val2) \
-  while (false)                     \
-  TORCH_DCHECK_OP(val1, val2, >)
-#endif // NDEBUG
-
-// Null pointer check macro
-#define TORCH_CHECK_NOTNULL(val) \
-  ::c10::CheckNotNull(__FILE__, __LINE__, #val, (val), false)
-
-#ifndef NDEBUG
-#define TORCH_DCHECK_NOTNULL(val) \
-  ::c10::CheckNotNull(__FILE__, __LINE__, #val, (val), true)
-#else // !NDEBUG
-#define TORCH_DCHECK_NOTNULL(val) \
-  while (false)                   \
-  TORCH_CHECK_NOTNULL(val)
-#endif // NDEBUG
-
 // ----------------------------------------------------------------------------
 // Deprecated macros
 // ----------------------------------------------------------------------------
--- a/c10/util/Logging.cpp
+++ b/c10/util/Logging.cpp
@ -291,32 +291,6 @@ namespace c10 {
 using fLB::FLAGS_logtostderr;
 using fLI::FLAGS_minloglevel;
 using fLI::FLAGS_v;
-
-MessageLogger::MessageLogger(
-    const char* file,
-    int line,
-    int severity,
-    bool exit_on_fatal)
-    : stream_(), severity_(severity), exit_on_fatal_(exit_on_fatal) {}
-
-MessageLogger::~MessageLogger() noexcept(false) {
-  if (severity_ == ::google::GLOG_FATAL) {
-    DealWithFatal();
-  }
-}
-
-std::stringstream& MessageLogger::stream() {
-  return stream_;
-}
-
-void MessageLogger::DealWithFatal() {
-  if (exit_on_fatal_) {
-    LOG(FATAL) << stream_.str();
-  } else {
-    throw c10::Error(stream_.str(), nullptr, nullptr);
-  }
-}
-
 } // namespace c10

 C10_DEFINE_int(
@ -438,16 +412,17 @@ void ShowLogInfoToStderr() {
  FLAGS_caffe2_log_level = GLOG_INFO;
 }

-MessageLogger::MessageLogger(
-    const char* file,
-    int line,
-    int severity,
-    bool exit_on_fatal)
-    : severity_(severity), exit_on_fatal_(exit_on_fatal) {
+MessageLogger::MessageLogger(const char* file, int line, int severity)
+    : severity_(severity) {
  if (severity_ < FLAGS_caffe2_log_level) {
    // Nothing needs to be logged.
    return;
  }
+#ifdef ANDROID
+  tag_ = "native";
+#else // !ANDROID
+  tag_ = "";
+#endif // ANDROID

  time_t rawtime = 0;
  time(&rawtime);
@ -483,7 +458,7 @@ MessageLogger::MessageLogger(
 }

 // Output the contents of the stream to the proper channel on destruction.
-MessageLogger::~MessageLogger() noexcept(false) {
+MessageLogger::~MessageLogger() {
  if (severity_ < FLAGS_caffe2_log_level) {
    // Nothing needs to be logged.
    return;
@ -523,18 +498,6 @@ MessageLogger::~MessageLogger() noexcept(false) {
  }
 }

-std::stringstream& MessageLogger::stream() {
-  return stream_;
-}
-
-void MessageLogger::DealWithFatal() {
-  if (exit_on_fatal_) {
-    abort();
-  } else {
-    throw c10::Error(stream_.str(), nullptr, nullptr);
-  }
-}
-
 } // namespace c10

 #endif // !C10_USE_GLOG
--- a/c10/util/logging_common.h
+++ b/c10/util/logging_common.h
@ -1,74 +0,0 @@
-#ifndef C10_UTIL_LOGGING_COMMON_H_
-#define C10_UTIL_LOGGING_COMMON_H_
-
-#include <c10/macros/Export.h>
-#include <sstream>
-
-namespace c10 {
-
-// MessageLogger that throws exceptions instead of aborting (glog version)
-// or logs and may abort (non-glog version).
-class C10_API MessageLogger {
- public:
-  MessageLogger(
-      const char* file,
-      int line,
-      int severity,
-      bool exit_on_fatal = true);
-  ~MessageLogger() noexcept(false);
-
-  // Return the stream associated with the logger object.
-  std::stringstream& stream();
-
- private:
-  // When there is a fatal log, and fatal == true, we abort
-  // otherwise, we throw.
-  void DealWithFatal();
-
-#if defined(ANDROID) && !defined(C10_USE_GLOG)
-  const char* tag_{"native"};
-#endif
-  std::stringstream stream_;
-  int severity_;
-  bool exit_on_fatal_;
-};
-
-// This class is used to explicitly ignore values in the conditional
-// logging macros. This avoids compiler warnings like "value computed
-// is not used" and "statement has no effect".
-class C10_API LoggerVoidify {
- public:
-  LoggerVoidify() = default;
-  // This has to be an operator with a precedence lower than << but
-  // higher than ?:
-  void operator&(const std::ostream& s [[maybe_unused]]) {}
-};
-
-// Forward declarations for CheckNotNull functions
-template <typename T>
-T& CheckNotNullCommon(
-    const char* file,
-    int line,
-    const char* names,
-    T& t,
-    bool fatal = true);
-
-template <typename T>
-T* CheckNotNull(
-    const char* file,
-    int line,
-    const char* names,
-    T* t,
-    bool fatal = true);
-
-template <typename T>
-T& CheckNotNull(
-    const char* file,
-    int line,
-    const char* names,
-    T& t,
-    bool fatal = true);
-
-} // namespace c10
-
-#endif // C10_UTIL_LOGGING_COMMON_H_
--- a/c10/util/logging_is_google_glog.h
+++ b/c10/util/logging_is_google_glog.h
@ -47,53 +47,57 @@ INSTANTIATE_FOR_CONTAINER(set)

 #endif

-#include <c10/util/logging_common.h>
 #include <glog/logging.h>

-namespace c10 {
+// Additional macros on top of glog
+#define TORCH_CHECK_EQ(val1, val2) CHECK_EQ(val1, val2)
+#define TORCH_CHECK_NE(val1, val2) CHECK_NE(val1, val2)
+#define TORCH_CHECK_LE(val1, val2) CHECK_LE(val1, val2)
+#define TORCH_CHECK_LT(val1, val2) CHECK_LT(val1, val2)
+#define TORCH_CHECK_GE(val1, val2) CHECK_GE(val1, val2)
+#define TORCH_CHECK_GT(val1, val2) CHECK_GT(val1, val2)

-[[noreturn]] void ThrowEnforceNotMet(
-    const char* file,
-    const int line,
-    const char* condition,
-    const std::string& msg,
-    const void* caller);
+#ifndef NDEBUG
+#define TORCH_DCHECK_EQ(val1, val2) DCHECK_EQ(val1, val2)
+#define TORCH_DCHECK_NE(val1, val2) DCHECK_NE(val1, val2)
+#define TORCH_DCHECK_LE(val1, val2) DCHECK_LE(val1, val2)
+#define TORCH_DCHECK_LT(val1, val2) DCHECK_LT(val1, val2)
+#define TORCH_DCHECK_GE(val1, val2) DCHECK_GE(val1, val2)
+#define TORCH_DCHECK_GT(val1, val2) DCHECK_GT(val1, val2)
+#else // !NDEBUG
+// These versions generate no code in optimized mode.
+#define TORCH_DCHECK_EQ(val1, val2) \
+  while (false)                     \
+  DCHECK_EQ(val1, val2)
+#define TORCH_DCHECK_NE(val1, val2) \
+  while (false)                     \
+  DCHECK_NE(val1, val2)
+#define TORCH_DCHECK_LE(val1, val2) \
+  while (false)                     \
+  DCHECK_LE(val1, val2)
+#define TORCH_DCHECK_LT(val1, val2) \
+  while (false)                     \
+  DCHECK_LT(val1, val2)
+#define TORCH_DCHECK_GE(val1, val2) \
+  while (false)                     \
+  DCHECK_GE(val1, val2)
+#define TORCH_DCHECK_GT(val1, val2) \
+  while (false)                     \
+  DCHECK_GT(val1, val2)
+#endif // NDEBUG

-template <typename T>
-T& CheckNotNullCommon(
-    const char* file,
-    int line,
-    const char* names,
-    T& t,
-    bool fatal) {
-  if (t == nullptr) {
-    MessageLogger(file, line, ::google::GLOG_FATAL, fatal).stream()
-        << "Check failed: '" << names << "' must be non NULL. ";
-  }
-  return t;
-}
+// Check that a pointer is not null.
+#define TORCH_CHECK_NOTNULL(val) CHECK_NOTNULL(val)

-template <typename T>
-T* CheckNotNull(
-    const char* file,
-    int line,
-    const char* names,
-    T* t,
-    bool fatal) {
-  return CheckNotNullCommon(file, line, names, t, fatal);
-}
-
-template <typename T>
-T& CheckNotNull(
-    const char* file,
-    int line,
-    const char* names,
-    T& t,
-    bool fatal) {
-  return CheckNotNullCommon(file, line, names, t, fatal);
-}
-
-} // namespace c10
+#ifndef NDEBUG
+// Debug only version of TORCH_CHECK_NOTNULL
+#define TORCH_DCHECK_NOTNULL(val) DCHECK_NOTNULL(val)
+#else // !NDEBUG
+// Optimized version - generates no code.
+#define TORCH_DCHECK_NOTNULL(val) \
+  while (false)                   \
+  DCHECK_NOTNULL(val)
+#endif // NDEBUG

 // Log with source location information override (to be used in generic
 // warning/error handlers implemented as functions, not macros)
--- a/c10/util/logging_is_not_google_glog.h
+++ b/c10/util/logging_is_not_google_glog.h
@ -13,7 +13,6 @@
 #include <vector>

 #include <c10/util/Flags.h>
-#include <c10/util/logging_common.h>

 const char CAFFE2_SEVERITY_PREFIX[] = "FEWIV";

@ -25,40 +24,61 @@ const int GLOG_ERROR = 2;
 const int GLOG_WARNING = 1;
 const int GLOG_INFO = 0;

+class C10_API MessageLogger {
+ public:
+  MessageLogger(const char* file, int line, int severity);
+  ~MessageLogger();
+  // Return the stream associated with the logger object.
+  std::stringstream& stream() {
+    return stream_;
+  }
+
+ private:
+  // When there is a fatal log, we simply abort.
+  void DealWithFatal() {
+    abort();
+  }
+
+  const char* tag_;
+  std::stringstream stream_;
+  int severity_;
+};
+
+// This class is used to explicitly ignore values in the conditional
+// logging macros.  This avoids compiler warnings like "value computed
+// is not used" and "statement has no effect".
+class C10_API LoggerVoidify {
+ public:
+  LoggerVoidify() = default;
+  // This has to be an operator with a precedence lower than << but
+  // higher than ?:
+  void operator&(const std::ostream& s [[maybe_unused]]) {}
+};
+
+// Log a message and terminate.
+template <class T>
+void LogMessageFatal(const char* file, int line, const T& message) {
+  MessageLogger(file, line, GLOG_FATAL).stream() << message;
+}
+
 // Helpers for TORCH_CHECK_NOTNULL(). Two are necessary to support both raw
 // pointers and smart pointers.
 template <typename T>
-T& CheckNotNullCommon(
-    const char* file,
-    int line,
-    const char* names,
-    T& t,
-    bool fatal) {
+T& CheckNotNullCommon(const char* file, int line, const char* names, T& t) {
  if (t == nullptr) {
-    MessageLogger(file, line, GLOG_FATAL, fatal).stream()
-        << "Check failed: '" << names << "' must be non NULL. ";
+    LogMessageFatal(file, line, std::string(names));
  }
  return t;
 }

 template <typename T>
-T* CheckNotNull(
-    const char* file,
-    int line,
-    const char* names,
-    T* t,
-    bool fatal) {
-  return CheckNotNullCommon(file, line, names, t, fatal);
+T* CheckNotNull(const char* file, int line, const char* names, T* t) {
+  return CheckNotNullCommon(file, line, names, t);
 }

 template <typename T>
-T& CheckNotNull(
-    const char* file,
-    int line,
-    const char* names,
-    T& t,
-    bool fatal) {
-  return CheckNotNullCommon(file, line, names, t, fatal);
+T& CheckNotNull(const char* file, int line, const char* names, T& t) {
+  return CheckNotNullCommon(file, line, names, t);
 }
 } // namespace c10

@ -116,6 +136,65 @@ static_assert(
          ::c10::MessageLogger(__FILE__, __LINE__, ::c10::GLOG_##n).stream()
 #endif // NDEBUG

+#define TORCH_CHECK_OP(val1, val2, op)                                        \
+  FATAL_IF(((val1)op(val2))) << "Check failed: " #val1 " " #op " " #val2 " (" \
+                             << (val1) << " vs. " << (val2) << ") "
+
+// TORCH_CHECK_OP macro definitions
+#define TORCH_CHECK_EQ(val1, val2) TORCH_CHECK_OP(val1, val2, ==)
+#define TORCH_CHECK_NE(val1, val2) TORCH_CHECK_OP(val1, val2, !=)
+#define TORCH_CHECK_LE(val1, val2) TORCH_CHECK_OP(val1, val2, <=)
+#define TORCH_CHECK_LT(val1, val2) TORCH_CHECK_OP(val1, val2, <)
+#define TORCH_CHECK_GE(val1, val2) TORCH_CHECK_OP(val1, val2, >=)
+#define TORCH_CHECK_GT(val1, val2) TORCH_CHECK_OP(val1, val2, >)
+
+#ifndef NDEBUG
+// Debug only versions of TORCH_CHECK_OP macros.
+#define TORCH_DCHECK_EQ(val1, val2) TORCH_CHECK_OP(val1, val2, ==)
+#define TORCH_DCHECK_NE(val1, val2) TORCH_CHECK_OP(val1, val2, !=)
+#define TORCH_DCHECK_LE(val1, val2) TORCH_CHECK_OP(val1, val2, <=)
+#define TORCH_DCHECK_LT(val1, val2) TORCH_CHECK_OP(val1, val2, <)
+#define TORCH_DCHECK_GE(val1, val2) TORCH_CHECK_OP(val1, val2, >=)
+#define TORCH_DCHECK_GT(val1, val2) TORCH_CHECK_OP(val1, val2, >)
+#else // !NDEBUG
+// These versions generate no code in optimized mode.
+#define TORCH_DCHECK_EQ(val1, val2) \
+  while (false)                     \
+  TORCH_CHECK_OP(val1, val2, ==)
+#define TORCH_DCHECK_NE(val1, val2) \
+  while (false)                     \
+  TORCH_CHECK_OP(val1, val2, !=)
+#define TORCH_DCHECK_LE(val1, val2) \
+  while (false)                     \
+  TORCH_CHECK_OP(val1, val2, <=)
+#define TORCH_DCHECK_LT(val1, val2) \
+  while (false)                     \
+  TORCH_CHECK_OP(val1, val2, <)
+#define TORCH_DCHECK_GE(val1, val2) \
+  while (false)                     \
+  TORCH_CHECK_OP(val1, val2, >=)
+#define TORCH_DCHECK_GT(val1, val2) \
+  while (false)                     \
+  TORCH_CHECK_OP(val1, val2, >)
+#endif // NDEBUG
+
+// Check that a pointer is not null.
+#define TORCH_CHECK_NOTNULL(val) \
+  ::c10::CheckNotNull(           \
+      __FILE__, __LINE__, "Check failed: '" #val "' Must be non NULL", (val))
+
+#ifndef NDEBUG
+// Debug only version of TORCH_CHECK_NOTNULL
+#define TORCH_DCHECK_NOTNULL(val) \
+  ::c10::CheckNotNull(            \
+      __FILE__, __LINE__, "Check failed: '" #val "' Must be non NULL", (val))
+#else // !NDEBUG
+// Optimized version - generates no code.
+#define TORCH_DCHECK_NOTNULL(val) \
+  while (false)                   \
+  TORCH_CHECK_NOTNULL(val)
+#endif // NDEBUG
+
 // ---------------------- Support for std objects --------------------------
 // These are adapted from glog to support a limited set of logging capability
 // for STL objects.
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Eli Uriegas	607000a867	Update (base update) [ghstack-poisoned]	2025-11-05 17:05:14 +00:00
Eli Uriegas	f9d5ee396d	Update (base update) [ghstack-poisoned]	2025-10-28 09:57:49 -07:00