adding documentation

Add API to annotate disjoint backward and handle in AC (#166536)
This adds zero-bubble / DualPipeV support for (S)AC. Before: - AC will always retrigger recompute upon every distinct backward. After: - Any checkpointed regions encountered by backward under the same instance of this context manager will only trigger recompute at most once, even if there are multiple calls to backward. - Backward calls under the same instance of this context manager must execute over non-overlapping regions of the backward graph even if retain_graph=True. Pull Request resolved: https://github.com/pytorch/pytorch/pull/166536 Approved by: https://github.com/albanD
2025-11-12 23:54:43 +08:00 · 2025-11-07 16:31:53 -08:00 · 2025-11-08 00:21:25 +00:00 · 2025-11-08 00:13:03 +00:00 · 2025-11-07 23:58:13 +00:00 · 2025-11-07 23:58:13 +00:00
499 changed files with 10575 additions and 4116 deletions
--- a/.ci/docker/almalinux/build.sh
+++ b/.ci/docker/almalinux/build.sh
@ -36,11 +36,7 @@ case ${DOCKER_TAG_PREFIX} in
    ;;
  rocm*)
    BASE_TARGET=rocm
-    PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
-    # add gfx950, gfx115x conditionally starting in ROCm 7.0
-    if [[ "$ROCM_VERSION" == *"7.0"* ]]; then
-        PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950;gfx1150;gfx1151"
-    fi
+    PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201;gfx950;gfx1150;gfx1151"
    EXTRA_BUILD_ARGS="${EXTRA_BUILD_ARGS} --build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}"
    ;;
  *)
--- a/.ci/docker/build.sh
+++ b/.ci/docker/build.sh
@ -168,6 +168,18 @@ case "$tag" in
    VISION=yes
    TRITON=yes
    ;;
+  pytorch-linux-jammy-py3.11-clang12)
+    ANACONDA_PYTHON_VERSION=3.11
+    CLANG_VERSION=12
+    VISION=no
+    TRITON=no
+    ;;
+  pytorch-linux-jammy-py3.12-clang12)
+    ANACONDA_PYTHON_VERSION=3.12
+    CLANG_VERSION=12
+    VISION=no
+    TRITON=no
+    ;;
  pytorch-linux-jammy-rocm-n-py3 | pytorch-linux-jammy-rocm-n-py3-benchmarks | pytorch-linux-noble-rocm-n-py3)
    if [[ $tag =~ "jammy" ]]; then
      ANACONDA_PYTHON_VERSION=3.10
@ -195,9 +207,9 @@ case "$tag" in
    NINJA_VERSION=1.9.0
    TRITON=yes
    ;;
-  pytorch-linux-jammy-xpu-n-py3 | pytorch-linux-jammy-xpu-n-py3-inductor-benchmarks)
+  pytorch-linux-noble-xpu-n-py3 | pytorch-linux-noble-xpu-n-py3-inductor-benchmarks)
    ANACONDA_PYTHON_VERSION=3.10
-    GCC_VERSION=11
+    GCC_VERSION=13
    VISION=yes
    XPU_VERSION=2025.2
    NINJA_VERSION=1.9.0
@ -248,6 +260,12 @@ case "$tag" in
    HALIDE=yes
    TRITON=yes
    ;;
+  pytorch-linux-jammy-cuda13.0-py3.12-pallas)
+    CUDA_VERSION=13.0.0
+    ANACONDA_PYTHON_VERSION=3.12
+    GCC_VERSION=11
+    PALLAS=yes
+    ;;
  pytorch-linux-jammy-py3.12-triton-cpu)
    CUDA_VERSION=12.6
    ANACONDA_PYTHON_VERSION=3.12
@ -369,6 +387,7 @@ docker build \
       --build-arg "INDUCTOR_BENCHMARKS=${INDUCTOR_BENCHMARKS}" \
       --build-arg "EXECUTORCH=${EXECUTORCH}" \
       --build-arg "HALIDE=${HALIDE}" \
+       --build-arg "PALLAS=${PALLAS}" \
       --build-arg "XPU_VERSION=${XPU_VERSION}" \
       --build-arg "UNINSTALL_DILL=${UNINSTALL_DILL}" \
       --build-arg "ACL=${ACL:-}" \
--- a/.ci/docker/ci_commit_pins/jax.txt
+++ b/.ci/docker/ci_commit_pins/jax.txt
@ -0,0 +1 @@
+0.8.0
--- a/.ci/docker/common/install_jax.sh
+++ b/.ci/docker/common/install_jax.sh
@ -0,0 +1,40 @@
+#!/bin/bash
+
+set -ex
+
+source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh"
+
+# Get the pinned JAX version (same for all CUDA versions)
+JAX_VERSION=$(get_pinned_commit /ci_commit_pins/jax)
+
+function install_jax_12() {
+  echo "Installing JAX ${JAX_VERSION} with CUDA 12 support"
+  pip_install "jax[cuda12]==${JAX_VERSION}" -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html
+
+  # Verify installation
+  python -c "import jax"  # check for errors
+  echo "JAX ${JAX_VERSION} installation completed successfully for CUDA 12"
+}
+
+function install_jax_13() {
+  echo "Installing JAX ${JAX_VERSION} with CUDA 13 support"
+  pip_install "jax[cuda13]==${JAX_VERSION}" -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html
+
+  # Verify installation
+  python -c "import jax"  # check for errors
+  echo "JAX ${JAX_VERSION} installation completed successfully for CUDA 13"
+}
+
+# idiomatic parameter and option handling in sh
+while test $# -gt 0
+do
+    case "$1" in
+    12.4|12.6|12.6.*|12.8|12.8.*|12.9|12.9.*) install_jax_12;
+        ;;
+    13.0|13.0.*) install_jax_13;
+        ;;
+    *) echo "bad argument $1"; exit 1
+        ;;
+    esac
+    shift
+done
--- a/.ci/docker/common/install_libgomp.sh
+++ b/.ci/docker/common/install_libgomp.sh
@ -0,0 +1,56 @@
+#!/bin/bash
+# Script used only in CD pipeline
+
+set -ex
+
+# install dependencies
+dnf -y install gmp-devel libmpc-devel texinfo flex bison
+
+cd /usr/local/src
+# fetch source for gcc 13
+git clone --depth 1 --single-branch -b releases/gcc-13.3.0 https://github.com/gcc-mirror/gcc.git gcc-13.3.0
+
+mkdir -p gcc-13.3.0/build-gomp
+cd gcc-13.3.0/build-gomp
+
+# configure gcc build
+# I got these flags by:
+# 1. downloading the source rpm for gcc-11 on AlmaLinux 8 container
+#    dnf install -y dnf-plugins-core rpmdevtools
+#    dnf download --source libgomp
+# 2. extracting the gcc.spec from the source.
+#    rpmdev-extract gcc-xx.src.rpm
+# 3. extracting optflags and ld_flags from gcc.spec:
+#    rpm --eval '%{optflags}'
+#    rpm --eval '%{build_ldflags}'
+#
+# I had to remove the following flags because they didn't compile for this version of libgomp:
+#   -Werror=format-security
+#   -specs=/usr/lib/rpm/redhat/redhat-hardened-cc1
+#   -specs=/usr/lib/rpm/redhat/redhat-annobin-cc1
+#
+# I added -march=armv8-a -mtune=generic to make them explicit. I don't think they're strictly needed.
+
+OPT_FLAGS='-O2 -march=armv8-a -mtune=generic'\
+' -fexceptions -g -grecord-gcc-switches -pipe -Wall'\
+' -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS'\
+' -fstack-protector-strong -fasynchronous-unwind-tables'\
+' -fstack-clash-protection'
+
+LDFLAGS='-Wl,-z,relro -Wl,--as-needed -Wl,-z,now'
+
+CFLAGS="$OPT_FLAGS" \
+CXXFLAGS="$OPT_FLAGS" \
+LDFLAGS="$LDFLAGS" \
+../configure \
+  --prefix=/usr \
+  --libdir=/usr/lib64 \
+  --enable-languages=c,c++ \
+  --disable-multilib \
+  --disable-bootstrap \
+  --enable-libgomp
+
+# only build libgomp
+make -j$(nproc) all-target-libgomp
+
+make install-target-libgomp
--- a/.ci/docker/common/install_xpu.sh
+++ b/.ci/docker/common/install_xpu.sh
@ -9,7 +9,7 @@ set -xe

 function install_ubuntu() {
    . /etc/os-release
-    if [[ ! " jammy " =~ " ${VERSION_CODENAME} " ]]; then
+    if [[ ! " jammy noble " =~ " ${VERSION_CODENAME} " ]]; then
        echo "Ubuntu version ${VERSION_CODENAME} not supported"
        exit
    fi
@ -35,25 +35,24 @@ function install_ubuntu() {
    # The xpu-smi packages
    apt-get install -y flex bison xpu-smi

-    if [[ "${XPU_DRIVER_TYPE,,}" == "lts" ]]; then
-        # Compute and Media Runtimes
+    # Compute and Media Runtimes
+    if [[ " ${VERSION_CODENAME} " =~ " noble " ]]; then
        apt-get install -y \
-            intel-opencl-icd intel-level-zero-gpu level-zero \
-            intel-media-va-driver-non-free libmfx1 libmfxgen1 libvpl2 \
-            libegl-mesa0 libegl1-mesa libegl1-mesa-dev libgbm1 libgl1-mesa-dev libgl1-mesa-dri \
+            intel-opencl-icd libze-intel-gpu1 libze1 \
+            intel-media-va-driver-non-free libmfx-gen1 libvpl2 \
+            libegl-mesa0 libegl1-mesa-dev libgbm1 libgl1-mesa-dev libgl1-mesa-dri \
            libglapi-mesa libgles2-mesa-dev libglx-mesa0 libigdgmm12 libxatracker2 mesa-va-drivers \
-            mesa-vdpau-drivers mesa-vulkan-drivers va-driver-all vainfo hwinfo clinfo
-        # Development Packages
-        apt-get install -y libigc-dev intel-igc-cm libigdfcl-dev libigfxcmrt-dev level-zero-dev
-    else # rolling driver
+            mesa-vdpau-drivers mesa-vulkan-drivers va-driver-all vainfo hwinfo clinfo intel-ocloc
+    else # jammy
        apt-get install -y \
            intel-opencl-icd libze-intel-gpu1 libze1 \
            intel-media-va-driver-non-free libmfx-gen1 libvpl2 \
            libegl-mesa0 libegl1-mesa libegl1-mesa-dev libgbm1 libgl1-mesa-dev libgl1-mesa-dri \
            libglapi-mesa libglx-mesa0 libigdgmm12 libxatracker2 mesa-va-drivers \
            mesa-vdpau-drivers mesa-vulkan-drivers va-driver-all vainfo hwinfo clinfo intel-ocloc
-        apt-get install -y libigc-dev intel-igc-cm libigdfcl-dev libigfxcmrt-dev libze-dev
    fi
+    # Development Packages
+    apt-get install -y libigc-dev intel-igc-cm libigdfcl-dev libigfxcmrt-dev libze-dev

    # Install Intel Support Packages
    apt-get install -y ${XPU_PACKAGES}
@ -66,7 +65,7 @@ function install_ubuntu() {
 function install_rhel() {
    . /etc/os-release
    if [[ "${ID}" == "rhel" ]]; then
-        if [[ ! " 8.8 8.9 9.0 9.2 9.3 " =~ " ${VERSION_ID} " ]]; then
+        if [[ ! " 8.8 8.10 9.0 9.2 9.3 " =~ " ${VERSION_ID} " ]]; then
            echo "RHEL version ${VERSION_ID} not supported"
            exit
        fi
@ -147,7 +146,7 @@ function install_sles() {
 XPU_DRIVER_VERSION=""
 if [[ "${XPU_DRIVER_TYPE,,}" == "lts" ]]; then
    # Use GPU driver LTS releases
-    XPU_DRIVER_VERSION="/lts/2350"
+    XPU_DRIVER_VERSION="/lts/2523"
 fi

 # Default use Intel® oneAPI Deep Learning Essentials 2025.1
--- a/.ci/docker/libtorch/build.sh
+++ b/.ci/docker/libtorch/build.sh
@ -49,11 +49,7 @@ case ${DOCKER_TAG_PREFIX} in
        fi
        BASE_TARGET=rocm
        GPU_IMAGE=rocm/dev-ubuntu-22.04:${GPU_ARCH_VERSION}-complete
-        PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
-        # add gfx950, gfx115x conditionally starting in ROCm 7.0
-        if [[ "$GPU_ARCH_VERSION" == *"7.0"* ]]; then
-            PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950;gfx1150;gfx1151"
-        fi
+        PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201;gfx950;gfx1150;gfx1151"
        DOCKER_GPU_BUILD_ARG="--build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg ROCM_VERSION=${GPU_ARCH_VERSION}"
        ;;
    *)
--- a/.ci/docker/manywheel/Dockerfile_2_28_aarch64
+++ b/.ci/docker/manywheel/Dockerfile_2_28_aarch64
@ -50,6 +50,10 @@ RUN rm install_ninja.sh
 ENV PATH=/opt/rh/gcc-toolset-${GCCTOOLSET_VERSION}/root/usr/bin:$PATH
 ENV LD_LIBRARY_PATH=/opt/rh/gcc-toolset-${GCCTOOLSET_VERSION}/root/usr/lib64:/opt/rh/gcc-toolset-${GCCTOOLSET_VERSION}/root/usr/lib:$LD_LIBRARY_PATH

+# Build a newer version of libgomp than that supported in AlmaLinux 8.
+COPY ./common/install_libgomp.sh install_libgomp.sh
+RUN bash ./install_libgomp.sh && rm install_libgomp.sh
+
 # git236+ would refuse to run git commands in repos owned by other users
 # Which causes version check to fail, as pytorch repo is bind-mounted into the image
 # Override this behaviour by treating every folder as safe
--- a/.ci/docker/manywheel/build.sh
+++ b/.ci/docker/manywheel/build.sh
@ -87,11 +87,7 @@ case ${image} in
        MANY_LINUX_VERSION="2_28"
        DEVTOOLSET_VERSION="11"
        GPU_IMAGE=rocm/dev-almalinux-8:${GPU_ARCH_VERSION}-complete
-        PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
-        # add gfx950, gfx115x conditionally starting in ROCm 7.0
-        if [[ "$GPU_ARCH_VERSION" == *"7.0"* ]]; then
-            PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950;gfx1150;gfx1151"
-        fi
+        PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201;gfx950;gfx1150;gfx1151"
        DOCKER_GPU_BUILD_ARG="--build-arg ROCM_VERSION=${GPU_ARCH_VERSION} --build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg DEVTOOLSET_VERSION=${DEVTOOLSET_VERSION}"
        ;;
    manylinux2_28-builder:xpu)
--- a/.ci/docker/requirements-docs.txt
+++ b/.ci/docker/requirements-docs.txt
@ -1,15 +1,11 @@
-sphinx==5.3.0
+sphinx==7.2.6
 #Description: This is used to generate PyTorch docs
-#Pinned versions: 5.3.0
+#Pinned versions: 7.2.6

-standard-imghdr==3.13.0; python_version >= "3.13"
-#Description: This is needed by Sphinx, so it needs to be added here.
-# The reasons are as follows:
-# 1) This module has been removed from the Python standard library since Python 3.13(https://peps.python.org/pep-0594/#imghdr);
-# 2) The current version of Sphinx (5.3.0) is not compatible with Python 3.13.
-# Once Sphinx is upgraded to a version compatible with Python 3.13 or later, we can remove this dependency.
+pytorch_sphinx_theme2==0.2.0
+#Description: This is needed to generate PyTorch docs
+#Pinned versions: 0.2.0

-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@71e55749be14ceb56e7f8211a9fb649866b87ad4#egg=pytorch_sphinx_theme2
 # TODO: sphinxcontrib.katex 0.9.0 adds a local KaTeX server to speed up pre-rendering
 # but it doesn't seem to work and hangs around idly. The initial thought that it is probably
 # something related to Docker setup. We can investigate this later.
@ -36,17 +32,17 @@ tensorboard==2.18.0 ; python_version >= "3.13"
 #Description: This is used to generate PyTorch docs
 #Pinned versions: 2.13.0

-breathe==4.34.0
+breathe==4.36.0
 #Description: This is used to generate PyTorch C++ docs
-#Pinned versions: 4.34.0
+#Pinned versions: 4.36.0

-exhale==0.2.3
+exhale==0.3.7
 #Description: This is used to generate PyTorch C++ docs
-#Pinned versions: 0.2.3
+#Pinned versions: 0.3.7

-docutils==0.16
+docutils==0.20
 #Description: This is used to generate PyTorch C++ docs
-#Pinned versions: 0.16
+#Pinned versions: 0.20

 bs4==0.0.1
 #Description: This is used to generate PyTorch C++ docs
@ -56,13 +52,13 @@ IPython==8.12.0
 #Description: This is used to generate PyTorch functorch docs
 #Pinned versions: 8.12.0

-myst-nb==0.17.2
+myst-nb==1.3.0
 #Description: This is used to generate PyTorch functorch and torch.compile docs.
-#Pinned versions: 0.17.2
+#Pinned versions: 1.3.0

 # The following are required to build torch.distributed.elastic.rendezvous.etcd* docs
 python-etcd==0.4.5
 sphinx-copybutton==0.5.0
-sphinx-design==0.4.0
+sphinx-design==0.6.1
 sphinxcontrib-mermaid==1.0.0
-myst-parser==0.18.1
+myst-parser==4.0.1
--- a/.ci/docker/ubuntu/Dockerfile
+++ b/.ci/docker/ubuntu/Dockerfile
@ -143,6 +143,15 @@ COPY ci_commit_pins/halide.txt halide.txt
 RUN if [ -n "${HALIDE}" ]; then bash ./install_halide.sh; fi
 RUN rm install_halide.sh common_utils.sh halide.txt

+ARG PALLAS
+ARG CUDA_VERSION
+# Install JAX with CUDA support (for Pallas)
+COPY ./common/install_jax.sh install_jax.sh
+COPY ./common/common_utils.sh common_utils.sh
+COPY ./ci_commit_pins/jax.txt /ci_commit_pins/jax.txt
+RUN if [ -n "${PALLAS}" ]; then bash ./install_jax.sh ${CUDA_VERSION}; fi
+RUN rm -f install_jax.sh common_utils.sh /ci_commit_pins/jax.txt
+
 ARG ONNX
 # Install ONNX dependencies
 COPY ./common/install_onnx.sh ./common/common_utils.sh ./
--- a/.ci/pytorch/python_doc_push_script.sh
+++ b/.ci/pytorch/python_doc_push_script.sh
@ -89,23 +89,41 @@ if [ "$is_main_doc" = true ]; then

  make coverage
  # Now we have the coverage report, we need to make sure it is empty.
-  # Count the number of lines in the file and turn that number into a variable
-  # $lines. The `cut -f1 ...` is to only parse the number, not the filename
-  # Skip the report header by subtracting 2: the header will be output even if
-  # there are no undocumented items.
+  # Sphinx 7.2.6+ format: python.txt contains a statistics table with a TOTAL row
+  # showing the undocumented count in the third column.
+  # Example: | TOTAL | 99.83% | 2 |
  #
  # Also: see docs/source/conf.py for "coverage_ignore*" items, which should
  # be documented then removed from there.
-  lines=$(wc -l build/coverage/python.txt 2>/dev/null |cut -f1 -d' ')
-  undocumented=$((lines - 2))
-  if [ $undocumented -lt 0 ]; then
+
+  # Extract undocumented count from TOTAL row in Sphinx 7.2.6 statistics table
+  # The table format is: | Module | Coverage | Undocumented |
+  # Extract the third column (undocumented count) from the TOTAL row
+  undocumented=$(grep "| TOTAL" build/coverage/python.txt | awk -F'|' '{print $4}' | tr -d ' ')
+
+  if [ -z "$undocumented" ] || ! [[ "$undocumented" =~ ^[0-9]+$ ]]; then
    echo coverage output not found
    exit 1
-  elif [ $undocumented -gt 0 ]; then
-    echo undocumented objects found:
-    cat build/coverage/python.txt
+  elif [ "$undocumented" -gt 0 ]; then
+    set +x  # Disable command echoing for cleaner output
+    echo ""
+    echo "====================="
+    echo "UNDOCUMENTED OBJECTS:"
+    echo "====================="
+    echo ""
+    # Find the line number of the TOTAL row and print only what comes after it
+    total_line=$(grep -n "| TOTAL" build/coverage/python.txt | cut -d: -f1)
+    if [ -n "$total_line" ]; then
+      # Print only the detailed list (skip the statistics table)
+      tail -n +$((total_line + 2)) build/coverage/python.txt
+    else
+      # Fallback to showing entire file if TOTAL line not found
+      cat build/coverage/python.txt
+    fi
+    echo ""
    echo "Make sure you've updated relevant .rsts in docs/source!"
-    echo "You can reproduce locally by running 'cd docs && make coverage && cat build/coverage/python.txt'"
+    echo "You can reproduce locally by running 'cd docs && make coverage && tail -n +\$((\$(grep -n \"| TOTAL\" build/coverage/python.txt | cut -d: -f1) + 2)) build/coverage/python.txt'"
+    set -x  # Re-enable command echoing
    exit 1
  fi
 else
--- a/.ci/pytorch/test.sh
+++ b/.ci/pytorch/test.sh
@ -824,6 +824,11 @@ test_inductor_halide() {
  assert_git_not_dirty
 }

+test_inductor_pallas() {
+  python test/run_test.py --include inductor/test_pallas.py --verbose
+  assert_git_not_dirty
+}
+
 test_inductor_triton_cpu() {
  python test/run_test.py --include inductor/test_triton_cpu_backend.py inductor/test_torchinductor_strided_blocks.py --verbose
  assert_git_not_dirty
@ -1724,6 +1729,8 @@ elif [[ "${TEST_CONFIG}" == *inductor_distributed* ]]; then
  test_inductor_distributed
 elif [[ "${TEST_CONFIG}" == *inductor-halide* ]]; then
  test_inductor_halide
+elif [[ "${TEST_CONFIG}" == *inductor-pallas* ]]; then
+  test_inductor_pallas
 elif [[ "${TEST_CONFIG}" == *inductor-triton-cpu* ]]; then
  test_inductor_triton_cpu
 elif [[ "${TEST_CONFIG}" == *inductor-micro-benchmark* ]]; then
--- a/.ci/pytorch/win-test-helpers/arm64/build_pytorch.ps1
+++ b/.ci/pytorch/win-test-helpers/arm64/build_pytorch.ps1
@ -70,7 +70,7 @@ sccache --zero-stats
 sccache --show-stats

 # Build the wheel
-python -m build --wheel --no-build-isolation
+python -m build --wheel --no-isolation
 if ($LASTEXITCODE -ne 0) { exit 1 }

 # Install the wheel locally
--- a/.github/ci_commit_pins/vision.txt
+++ b/.github/ci_commit_pins/vision.txt
@ -1 +1 @@
-cfbc5c2f1c798991715a6b06bb3ce46478c4487c
+ca2212438fdd8ce29b66999ed70ed54b0f9372d1
--- a/.github/labeler.yml
+++ b/.github/labeler.yml
@ -138,7 +138,8 @@
 - test/test_matmul_cuda.py
 - test/test_scaled_matmul_cuda.py
 - test/inductor/test_fp8.py
- aten/src/ATen/native/cuda/Blas.cpp
+- aten/src/ATen/native/cuda/*Blas.cpp
+- aten/src/ATen/cuda/CUDA*Blas.*
 - torch/**/*cublas*
 - torch/_inductor/kernel/mm.py
 - test/inductor/test_max_autotune.py
@ -148,7 +149,8 @@
 - test/test_matmul_cuda.py
 - test/test_scaled_matmul_cuda.py
 - test/inductor/test_fp8.py
- aten/src/ATen/native/cuda/Blas.cpp
+- aten/src/ATen/native/cuda/*Blas.cpp
+- aten/src/ATen/cuda/CUDA*Blas.*
 - torch/**/*cublas*
 - torch/_inductor/kernel/mm.py
 - test/inductor/test_max_autotune.py
@ -158,7 +160,8 @@
 - test/test_matmul_cuda.py
 - test/test_scaled_matmul_cuda.py
 - test/inductor/test_fp8.py
- aten/src/ATen/native/cuda/Blas.cpp
+- aten/src/ATen/native/cuda/*Blas.cpp
+- aten/src/ATen/cuda/CUDA*Blas.*
 - torch/_inductor/kernel/mm.py
 - test/inductor/test_max_autotune.py
 - third_party/fbgemm
--- a/.github/nitpicks.yml
+++ b/.github/nitpicks.yml
@ -10,3 +10,4 @@
  pathFilter:
    - 'torch/csrc/inductor/aoti_torch/c/*'
    - 'torch/csrc/inductor/aoti_torch/generated/*'
+    - 'torch/csrc/stable/c/*'
--- a/.github/pytorch-probot.yml
+++ b/.github/pytorch-probot.yml
@ -2,8 +2,8 @@ tracking_issue: 24422
 ciflow_tracking_issue: 64124
 ciflow_push_tags:
 - ciflow/b200
- ciflow/b200-symm-mem
 - ciflow/b200-distributed
+- ciflow/b200-symm-mem
 - ciflow/binaries
 - ciflow/binaries_libtorch
 - ciflow/binaries_wheel
@ -22,6 +22,8 @@ ciflow_push_tags:
 - ciflow/inductor-perf-test-nightly-xpu
 - ciflow/inductor-periodic
 - ciflow/inductor-rocm
+- ciflow/inductor-rocm-mi200
+- ciflow/inductor-rocm-mi300
 - ciflow/linux-aarch64
 - ciflow/mps
 - ciflow/nightly
@ -33,11 +35,13 @@ ciflow_push_tags:
 - ciflow/quantization-periodic
 - ciflow/riscv64
 - ciflow/rocm
+- ciflow/rocm-mi200
 - ciflow/rocm-mi300
 - ciflow/rocm-mi355
 - ciflow/rocm-navi31
 - ciflow/s390
 - ciflow/slow
+- ciflow/slow-rocm-mi200
 - ciflow/torchbench
 - ciflow/triton_binaries
 - ciflow/trunk
--- a/.github/workflows/docker-builds.yml
+++ b/.github/workflows/docker-builds.yml
@ -56,6 +56,8 @@ jobs:
          pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9,
          pytorch-linux-jammy-cuda12.4-cudnn9-py3-gcc11,
          pytorch-linux-jammy-py3.10-clang12,
+          pytorch-linux-jammy-py3.11-clang12,
+          pytorch-linux-jammy-py3.12-clang12,
          pytorch-linux-jammy-py3.13-clang12,
          pytorch-linux-jammy-py3.14-clang12,
          pytorch-linux-jammy-rocm-n-py3,
@ -65,9 +67,10 @@ jobs:
          pytorch-linux-jammy-py3.10-gcc11,
          pytorch-linux-jammy-py3-gcc11-inductor-benchmarks,
          pytorch-linux-jammy-py3.12-halide,
+          pytorch-linux-jammy-cuda13.0-py3.12-pallas,
          pytorch-linux-jammy-xpu-n-1-py3,
-          pytorch-linux-jammy-xpu-n-py3,
-          pytorch-linux-jammy-xpu-n-py3-inductor-benchmarks,
+          pytorch-linux-noble-xpu-n-py3,
+          pytorch-linux-noble-xpu-n-py3-inductor-benchmarks,
          pytorch-linux-jammy-py3-clang18-asan,
          pytorch-linux-jammy-py3-clang12-onnx,
          pytorch-linux-jammy-linter,
--- a/.github/workflows/inductor-perf-test-nightly-xpu.yml
+++ b/.github/workflows/inductor-perf-test-nightly-xpu.yml
@ -83,8 +83,8 @@ jobs:
    needs: get-label-type
    with:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      build-environment: linux-jammy-xpu-n-py3.10
-      docker-image-name: ci-image:pytorch-linux-jammy-xpu-n-py3-inductor-benchmarks
+      build-environment: linux-noble-xpu-n-py3.10
+      docker-image-name: ci-image:pytorch-linux-noble-xpu-n-py3-inductor-benchmarks
      runner: linux.c7i.12xlarge
      test-matrix: |
        { include: [
@ -117,7 +117,7 @@ jobs:
    uses: ./.github/workflows/_xpu-test.yml
    needs: xpu-n-py3_10-inductor-benchmark-build
    with:
-      build-environment: linux-jammy-xpu-n-py3.10
+      build-environment: linux-noble-xpu-n-py3.10
      dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-false-cppwrapper-true-aotinductor-true-freezing_cudagraphs-false-cudagraphs_low_precision-false
      docker-image: ${{ needs.xpu-n-py3_10-inductor-benchmark-build.outputs.docker-image }}
      test-matrix: ${{ needs.xpu-n-py3_10-inductor-benchmark-build.outputs.test-matrix }}
@ -137,7 +137,7 @@ jobs:
    uses: ./.github/workflows/_xpu-test.yml
    needs: xpu-n-py3_10-inductor-benchmark-build
    with:
-      build-environment: linux-jammy-xpu-n-py3.10
+      build-environment: linux-noble-xpu-n-py3.10
      dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cudagraphs-${{ inputs.cudagraphs }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}-maxautotune-${{ inputs.maxautotune }}-freezing_cudagraphs-${{ inputs.freezing_cudagraphs }}-cudagraphs_low_precision-${{ inputs.cudagraphs }}
      docker-image: ${{ needs.xpu-n-py3_10-inductor-benchmark-build.outputs.docker-image }}
      test-matrix: ${{ needs.xpu-n-py3_10-inductor-benchmark-build.outputs.test-matrix }}
--- a/.github/workflows/inductor-rocm-mi200.yml
+++ b/.github/workflows/inductor-rocm-mi200.yml
@ -2,12 +2,12 @@ name: inductor-rocm

 on:
  schedule:
-    - cron: 0 * * * *
+    - cron: 0 */3 * * *
  push:
    branches:
      - release/*
    tags:
-      - ciflow/inductor-rocm/*
+      - ciflow/inductor-rocm-mi200/*
  workflow_dispatch:

 concurrency:
--- a/.github/workflows/inductor-rocm-mi300.yml
+++ b/.github/workflows/inductor-rocm-mi300.yml
@ -7,6 +7,7 @@ on:
      - release/*
    tags:
      - ciflow/inductor-rocm/*
+      - ciflow/inductor-rocm-mi300/*
  workflow_dispatch:

 concurrency:
--- a/.github/workflows/inductor-unittest.yml
+++ b/.github/workflows/inductor-unittest.yml
@ -81,6 +81,32 @@ jobs:
      test-matrix: ${{ needs.inductor-halide-build.outputs.test-matrix }}
    secrets: inherit

+  inductor-pallas-build:
+    name: inductor-pallas-build
+    uses: ./.github/workflows/_linux-build.yml
+    needs: get-label-type
+    with:
+      build-environment: linux-jammy-py3.12-gcc11
+      docker-image-name: ci-image:pytorch-linux-jammy-cuda13.0-py3.12-pallas
+      cuda-arch-list: '8.9'
+      runner: linux.8xlarge.memory
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      test-matrix: |
+        { include: [
+          { config: "inductor-pallas", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
+        ]}
+    secrets: inherit
+
+  inductor-pallas-test:
+    name: inductor-pallas-test
+    uses: ./.github/workflows/_linux-test.yml
+    needs: inductor-pallas-build
+    with:
+      build-environment: linux-jammy-py3.12-gcc11
+      docker-image: ${{ needs.inductor-pallas-build.outputs.docker-image }}
+      test-matrix: ${{ needs.inductor-pallas-build.outputs.test-matrix }}
+    secrets: inherit
+
  inductor-triton-cpu-build:
    name: inductor-triton-cpu-build
    uses: ./.github/workflows/_linux-build.yml
--- a/.github/workflows/periodic-rocm-mi200.yml
+++ b/.github/workflows/periodic-rocm-mi200.yml
@ -11,7 +11,6 @@ on:
    - cron: 29 8 * * *  # about 1:29am PDT, for mem leak check and rerun disabled tests
  push:
    tags:
-      - ciflow/periodic/*
      - ciflow/periodic-rocm-mi200/*
    branches:
      - release/*
--- a/.github/workflows/periodic-rocm-mi300.yml
+++ b/.github/workflows/periodic-rocm-mi300.yml
@ -11,6 +11,7 @@ on:
    - cron: 29 8 * * *  # about 1:29am PDT, for mem leak check and rerun disabled tests
  push:
    tags:
+      - ciflow/periodic/*
      - ciflow/periodic-rocm-mi300/*
    branches:
      - release/*
--- a/.github/workflows/pull.yml
+++ b/.github/workflows/pull.yml
@ -342,16 +342,16 @@ jobs:
      test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.test-matrix }}
    secrets: inherit

-  linux-jammy-xpu-n-py3_10-build:
-    name: linux-jammy-xpu-n-py3.10
+  linux-noble-xpu-n-py3_10-build:
+    name: linux-noble-xpu-n-py3.10
    uses: ./.github/workflows/_linux-build.yml
    needs: get-label-type
    with:
      # This should sync with the build in xpu.yml but xpu uses a larger runner
      # sync-tag: linux-xpu-n-build
      runner_prefix: ${{ needs.get-label-type.outputs.label-type }}
-      build-environment: linux-jammy-xpu-n-py3.10
-      docker-image-name: ci-image:pytorch-linux-jammy-xpu-n-py3
+      build-environment: linux-noble-xpu-n-py3.10
+      docker-image-name: ci-image:pytorch-linux-noble-xpu-n-py3
      test-matrix: |
        { include: [
          { config: "default", shard: 1, num_shards: 4, runner: "linux.idc.xpu" },
--- a/.github/workflows/rocm-mi200.yml
+++ b/.github/workflows/rocm-mi200.yml
@ -5,11 +5,12 @@ on:
    branches:
      - release/*
    tags:
-      - ciflow/rocm/*
+      - ciflow/rocm-mi200/*
  workflow_dispatch:
  schedule:
    - cron: 29 8 * * *  # about 1:29am PDT
-    - cron: 0 * * * *
+    - cron: 0 */3 * * *
+

 concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
--- a/.github/workflows/rocm-mi300.yml
+++ b/.github/workflows/rocm-mi300.yml
@ -6,6 +6,7 @@ on:
      - main
      - release/*
    tags:
+      - ciflow/rocm/*
      - ciflow/rocm-mi300/*
  workflow_dispatch:
  schedule:
--- a/.github/workflows/slow-rocm-mi200.yml
+++ b/.github/workflows/slow-rocm-mi200.yml
@ -0,0 +1,81 @@
+# This workflow is dedicated to host slow jobs that are run only periodically because
+# they are too slow to run in every commit.  The list of slow tests can be found in
+# https://github.com/pytorch/test-infra/blob/generated-stats/stats/slow-tests.json
+name: slow-rocm-mi200
+
+on:
+  push:
+    branches:
+      - release/*
+    tags:
+      - ciflow/slow/*
+      - ciflow/slow-rocm-mi200/*
+  schedule:
+    - cron: 0 */3 * * *
+  workflow_dispatch:
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}-${{ github.event.schedule }}
+  cancel-in-progress: true
+
+permissions:
+  id-token: write
+  contents: read
+
+jobs:
+  llm-td:
+    if: github.repository_owner == 'pytorch'
+    name: before-test
+    uses: ./.github/workflows/llm_td_retrieval.yml
+    permissions:
+      id-token: write
+      contents: read
+
+  target-determination:
+    name: before-test
+    uses: ./.github/workflows/target_determination.yml
+    needs: llm-td
+    permissions:
+      id-token: write
+      contents: read
+
+  get-label-type:
+    name: get-label-type
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
+    with:
+      triggering_actor: ${{ github.triggering_actor }}
+      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
+      curr_branch: ${{ github.head_ref || github.ref_name }}
+      curr_ref_type: ${{ github.ref_type }}
+
+  linux-jammy-rocm-py3_10-build:
+    name: linux-jammy-rocm-py3.10
+    uses: ./.github/workflows/_linux-build.yml
+    needs: get-label-type
+    with:
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      build-environment: linux-jammy-rocm-py3.10
+      docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
+      sync-tag: rocm-build
+      test-matrix: |
+        { include: [
+          { config: "slow", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.2", owners: ["module:rocm"] },
+          { config: "slow", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.2", owners: ["module:rocm"] },
+        ]}
+    secrets: inherit
+
+  linux-jammy-rocm-py3_10-test:
+    permissions:
+      id-token: write
+      contents: read
+    name: linux-jammy-rocm-py3.10
+    uses: ./.github/workflows/_rocm-test.yml
+    needs:
+      - linux-jammy-rocm-py3_10-build
+      - target-determination
+    with:
+      build-environment: linux-jammy-rocm-py3.10
+      docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }}
+    secrets: inherit
--- a/.github/workflows/slow.yml
+++ b/.github/workflows/slow.yml
@ -105,36 +105,6 @@ jobs:
      test-matrix: ${{ needs.linux-jammy-py3_10-clang12-build.outputs.test-matrix }}
    secrets: inherit

-  linux-jammy-rocm-py3_10-build:
-    name: linux-jammy-rocm-py3.10
-    uses: ./.github/workflows/_linux-build.yml
-    needs: get-label-type
-    with:
-      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      build-environment: linux-jammy-rocm-py3.10
-      docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
-      test-matrix: |
-        { include: [
-          { config: "slow", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.2", owners: ["module:rocm"] },
-          { config: "slow", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.2", owners: ["module:rocm"] },
-        ]}
-    secrets: inherit
-
-  linux-jammy-rocm-py3_10-test:
-    permissions:
-      id-token: write
-      contents: read
-    name: linux-jammy-rocm-py3.10
-    uses: ./.github/workflows/_rocm-test.yml
-    needs:
-      - linux-jammy-rocm-py3_10-build
-      - target-determination
-    with:
-      build-environment: linux-jammy-rocm-py3.10
-      docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }}
-    secrets: inherit
-
  linux-jammy-py3_10-clang18-asan-build:
    name: linux-jammy-py3.10-clang18-asan
    uses: ./.github/workflows/_linux-build.yml
--- a/.github/workflows/upload-test-stats.yml
+++ b/.github/workflows/upload-test-stats.yml
@ -11,15 +11,16 @@ on:
      - inductor
      - unstable
      - slow
+      - slow-rocm-mi200
      - unstable-periodic
      - inductor-periodic
-      - rocm
+      - rocm-mi200
      - rocm-mi300
      - rocm-mi355
      - inductor-micro-benchmark
      - inductor-micro-benchmark-x86
      - inductor-cu124
-      - inductor-rocm
+      - inductor-rocm-mi200
      - inductor-rocm-mi300
      - mac-mps
      - linux-aarch64
--- a/.github/workflows/xpu.yml
+++ b/.github/workflows/xpu.yml
@ -47,15 +47,15 @@ jobs:
        ]}
    secrets: inherit

-  linux-jammy-xpu-n-py3_10-build:
-    name: linux-jammy-xpu-n-py3.10
+  linux-noble-xpu-n-py3_10-build:
+    name: linux-noble-xpu-n-py3.10
    uses: ./.github/workflows/_linux-build.yml
    needs: get-label-type
    with:
      sync-tag: linux-xpu-n-build
      runner_prefix: ${{ needs.get-label-type.outputs.label-type }}
-      build-environment: linux-jammy-xpu-n-py3.10
-      docker-image-name: ci-image:pytorch-linux-jammy-xpu-n-py3
+      build-environment: linux-noble-xpu-n-py3.10
+      docker-image-name: ci-image:pytorch-linux-noble-xpu-n-py3
      runner: linux.c7i.12xlarge
      test-matrix: |
        { include: [
@ -74,17 +74,17 @@ jobs:
        ]}
    secrets: inherit

-  linux-jammy-xpu-n-py3_10-test:
-    name: linux-jammy-xpu-n-py3.10
+  linux-noble-xpu-n-py3_10-test:
+    name: linux-noble-xpu-n-py3.10
    uses: ./.github/workflows/_xpu-test.yml
-    needs: linux-jammy-xpu-n-py3_10-build
+    needs: linux-noble-xpu-n-py3_10-build
    permissions:
      id-token: write
      contents: read
    with:
-      build-environment: linux-jammy-xpu-n-py3.10
-      docker-image: ${{ needs.linux-jammy-xpu-n-py3_10-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-jammy-xpu-n-py3_10-build.outputs.test-matrix }}
+      build-environment: linux-noble-xpu-n-py3.10
+      docker-image: ${{ needs.linux-noble-xpu-n-py3_10-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-noble-xpu-n-py3_10-build.outputs.test-matrix }}
    secrets: inherit

  windows-xpu-n-1-build:
--- a/.lintrunner.toml
+++ b/.lintrunner.toml
@ -143,7 +143,8 @@ init_command = [
    'tools/linter/adapters/pip_init.py',
    '--dry-run={{DRYRUN}}',
    'numpy==1.26.4 ; python_version >= "3.10" and python_version <= "3.11"',
-    'numpy==2.1.0 ; python_version >= "3.12"',
+    'numpy==2.1.0 ; python_version >= "3.12" and python_version <= "3.13"',
+    'numpy==2.3.4 ; python_version >= "3.14"',
    'expecttest==0.3.0',
    'pyrefly==0.36.2',
    'sympy==1.13.3',
@ -1401,7 +1402,7 @@ init_command = [
    '--dry-run={{DRYRUN}}',
    'usort==1.0.8.post1',
    'isort==6.0.1',
-    'ruff==0.13.1',  # sync with RUFF
+    'ruff==0.14.4',  # sync with RUFF
 ]
 is_formatter = true

@ -1536,7 +1537,7 @@ init_command = [
    'python3',
    'tools/linter/adapters/pip_init.py',
    '--dry-run={{DRYRUN}}',
-    'ruff==0.13.1',  # sync with PYFMT
+    'ruff==0.14.4',  # sync with PYFMT
 ]
 is_formatter = true

--- a/6
+++ b/6
@ -210,8 +210,12 @@ torch/backends/cudnn/ @eqy @syed-ahmed @Aidyn-A
 /test/inductor/test_flex_attention.py @drisspg
 /test/inductor/test_flex_decoding.py @drisspg

-# Low Precision GEMMs
+# Low Precision & Grouped GEMMs
 /aten/src/ATen/native/cuda/Blas.cpp @drisspg @slayton58
+/aten/src/ATen/native/cuda/GroupedBlas.cpp @drisspg @slayton58
+/aten/src/ATen/native/cuda/ScaledBlas.cpp @drisspg @slayton58
 /aten/src/ATen/cuda/CUDABlas.cpp @drisspg @slayton58
 /aten/src/ATen/cuda/CUDABlas.h @drisspg @slayton58
+/aten/src/ATen/cuda/CUDAScaledBlas.cpp @drisspg @slayton58
+/aten/src/ATen/cuda/CUDAScaledBlas.h @drisspg @slayton58
 /test/test_scaled_matmul_cuda.py @drisspg @slayton58
--- a/aten/src/ATen/Context.h
+++ b/aten/src/ATen/Context.h
@ -174,6 +174,12 @@ class TORCH_API Context {
  static long versionCuDNN() {
    return detail::getCUDAHooks().versionCuDNN();
  }
+  static long versionRuntimeCuDNN() {
+    return detail::getCUDAHooks().versionRuntimeCuDNN();
+  }
+  static long versionCuDNNFrontend() {
+    return detail::getCUDAHooks().versionCuDNNFrontend();
+  }
  static bool hasCuSOLVER() {
    return detail::getCUDAHooks().hasCuSOLVER();
  }
--- a/aten/src/ATen/Dispatch.h
+++ b/aten/src/ATen/Dispatch.h
@ -6,6 +6,7 @@
 #include <c10/util/Half.h>
 #include <c10/util/Metaprogramming.h>
 #include <c10/util/complex.h>
+#include <torch/headeronly/core/Dispatch.h>

 #ifdef __CUDACC__
 #include <cuda.h> // For CUDA_VERSION
@ -61,12 +62,9 @@ TORCH_API void record_kernel_function_dtype(std::string name);
    }                                                 \
  } while (0)

-#define AT_PRIVATE_CASE_TYPE_USING_HINT(enum_type, HINT, ...)                 \
-  case enum_type: {                                                           \
-    AT_PRIVATE_CHECK_SELECTIVE_BUILD(enum_type);                              \
-    using HINT [[maybe_unused]] = c10::impl::ScalarTypeToCPPTypeT<enum_type>; \
-    return __VA_ARGS__();                                                     \
-  }
+#define AT_PRIVATE_CASE_TYPE_USING_HINT(enum_type, HINT, ...) \
+  THO_PRIVATE_CASE_TYPE_USING_HINT_TMPL(                      \
+      AT_PRIVATE_CHECK_SELECTIVE_BUILD, enum_type, HINT, __VA_ARGS__)

 #define AT_DISPATCH_CASE(enum_type, ...) \
  AT_PRIVATE_CASE_TYPE_USING_HINT(enum_type, scalar_t, __VA_ARGS__)
@ -95,14 +93,6 @@ TORCH_API void record_kernel_function_dtype(std::string name);
    return __VA_ARGS__();                                                   \
  }

-namespace detail {
-
-inline at::ScalarType scalar_type(at::ScalarType s) {
-  return s;
-}
-
-} // namespace detail
-
 // The AT_DISPATCH_* family of macros provides the ability to
 // conveniently generate specializations of a kernel over all of the
 // dtypes we care about in PyTorch.  We call it "dispatch" because
@ -190,27 +180,13 @@ inline at::ScalarType scalar_type(at::ScalarType s) {
 // but we're just being safe (and it doesn't hurt.)  Note we must
 // use it to shut up warnings about unused store.

-#define AT_DISPATCH_SWITCH(TYPE, NAME, ...)                                 \
-  [&] {                                                                     \
-    const auto& the_type = TYPE;                                            \
-    constexpr const char* at_dispatch_name = NAME;                          \
-    /* don't use TYPE again in case it is an expensive or side-effect op */ \
-    at::ScalarType _st = ::detail::scalar_type(the_type);                   \
-    RECORD_KERNEL_FUNCTION_DTYPE(at_dispatch_name, _st);                    \
-    C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wswitch-enum")             \
-    switch (_st) {                                                          \
-      __VA_ARGS__                                                           \
-      default:                                                              \
-        TORCH_CHECK_NOT_IMPLEMENTED(                                        \
-            false,                                                          \
-            '"',                                                            \
-            at_dispatch_name,                                               \
-            "\" not implemented for '",                                     \
-            toString(_st),                                                  \
-            "'");                                                           \
-    }                                                                       \
-    C10_DIAGNOSTIC_POP()                                                    \
-  }()
+#define AT_DISPATCH_SWITCH(TYPE, NAME, ...) \
+  THO_DISPATCH_SWITCH_TMPL(                 \
+      RECORD_KERNEL_FUNCTION_DTYPE,         \
+      TORCH_CHECK_NOT_IMPLEMENTED,          \
+      TYPE,                                 \
+      NAME,                                 \
+      __VA_ARGS__)

 #define AT_DISPATCH_CASE_FLOATING_TYPES(...)            \
  AT_DISPATCH_CASE(at::ScalarType::Double, __VA_ARGS__) \
--- a/aten/src/ATen/Dispatch_v2.h
+++ b/aten/src/ATen/Dispatch_v2.h
@ -1,3 +1,8 @@
+#pragma once
+
+#include <torch/headeronly/core/Dispatch_v2.h>
+
+// Get AT_DISPATCH_SWITCH and AT_DISPATCH_CASE:
 #include <ATen/Dispatch.h>

 // This is a new implementation of the AT_DISPATCH macro family from
@ -74,41 +79,19 @@
 // macro expansion occurs, mediated with AT_EXPAND and AT_GUARD.  I mostly
 // relied on GPT4 to help me get it right.

-// Public API macros
-
 // See documentation above
 #define AT_DISPATCH_V2(TYPE, NAME, BODY, ...) \
-  AT_DISPATCH_SWITCH(TYPE, NAME, AT_AP_VAR(AT_WRAP(BODY), TYPE, __VA_ARGS__))
-
-// This macro lets you pass an arbitrary expression that may contain internal
-// commas to another macro without having the commas causing the expression
-// to be interpreted as being multiple arguments
-#define AT_WRAP(...) __VA_ARGS__
-
-#define AT_FLOAT8_TYPES                                          \
-  c10::kFloat8_e5m2, c10::kFloat8_e5m2fnuz, c10::kFloat8_e4m3fn, \
-      c10::kFloat8_e4m3fnuz, c10::kFloat8_e8m0fnu
-
-#define AT_INTEGRAL_TYPES \
-  c10::kByte, c10::kChar, c10::kInt, c10::kLong, c10::kShort
-#define AT_FLOATING_TYPES c10::kDouble, c10::kFloat
-#define AT_BAREBONES_UNSIGNED_TYPES c10::kUInt16, c10::kUInt32, c10::kUInt64
-#define AT_INTEGRAL_TYPES_V2 \
-  AT_EXPAND(AT_INTEGRAL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES)
-#define AT_COMPLEX_TYPES c10::kComplexDouble, c10::kComplexFloat
-#define AT_QINT_TYPES c10::kQInt8, c10::kQUInt8, c10::kQInt32
-// NB: not *actually* all types
-#define AT_ALL_TYPES AT_EXPAND(AT_INTEGRAL_TYPES), AT_EXPAND(AT_FLOATING_TYPES)
-#define AT_ALL_TYPES_AND_COMPLEX \
-  AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_COMPLEX_TYPES)
-
-// Helper macros
+  THO_DISPATCH_V2_TMPL(                       \
+      AT_DISPATCH_SWITCH,                     \
+      AT_DISPATCH_CASE,                       \
+      TYPE,                                   \
+      NAME,                                   \
+      AT_WRAP(BODY),                          \
+      __VA_ARGS__)

+// Unused helper macros, kept for BC:
 #define AT_AP_VAR(N, T, ...) \
  AT_EXPAND(AT_CONCAT(AT_AP, AT_NUM_ARGS(__VA_ARGS__))(AT_WRAP(N), __VA_ARGS__))
-#define AT_CONCAT(a, b) AT_CONCAT_AUX(a, b)
-#define AT_CONCAT_AUX(a, b) a##b
-#define AT_EXPAND(X) X

 // Ensure we never have too many scalar types for the expansion here to
 // support.  To bump this, you must regenerate the macros below.
@ -119,12 +102,6 @@ static_assert(static_cast<int>(c10::ScalarType::NumOptions) < 60);

 num_args = 60

-nums = ', '.join(str(i) for i in reversed(range(num_args+1)))
-args = ', '.join(f'_{i}' for i in range(1, num_args+1))
-
-print(f'#define AT_NUM_ARGS(...) AT_EXPAND(AT_NUM_ARGS_AUX(__VA_ARGS__, {nums}))')
-print(f'#define AT_NUM_ARGS_AUX({args}, N, ...) N')
-
 for i in range(1, num_args+1):
    args = ', '.join(f'_{i}' for i in range(1, i+1))
    cases = ' '.join([f'AT_DISPATCH_CASE(_{j}, N)' for j in range(1, i+1)])
@ -135,8 +112,6 @@ for i in range(1, num_args+1):
 // Begin generated code
 // clang-format off

-#define AT_NUM_ARGS(...) AT_EXPAND(AT_NUM_ARGS_AUX(__VA_ARGS__, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0))
-#define AT_NUM_ARGS_AUX(_1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34, _35, _36, _37, _38, _39, _40, _41, _42, _43, _44, _45, _46, _47, _48, _49, _50, _51, _52, _53, _54, _55, _56, _57, _58, _59, _60, N, ...) N
 #define AT_AP1(N, _1) AT_DISPATCH_CASE(_1, N)
 #define AT_AP2(N, _1, _2) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N)
 #define AT_AP3(N, _1, _2, _3) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N)
--- a/aten/src/ATen/cuda/CUDABlas.cpp
+++ b/aten/src/ATen/cuda/CUDABlas.cpp
@ -1597,7 +1597,7 @@ bool gemm_and_bias(
  }

  using opmath_t = at::opmath_type<Dtype>;
-  opmath_t beta_val = 0; // bias is added in epilogue
+  opmath_t beta_val = bias ? 0 : 1; // bias is added in epilogue unless nullptr

  cudaDataType_t abType = CUDA_R_32F;
  cudaDataType_t cType = CUDA_R_32F;
@ -1686,15 +1686,22 @@ bool gemm_and_bias(
    _syncCurrentWithCarveoutStream(stream, true);
  }
 #endif
-  cublasLtEpilogue_t epilogue = CUBLASLT_EPILOGUE_BIAS;
-  if (activation == GEMMAndBiasActivationEpilogue::RELU) {
-    epilogue = CUBLASLT_EPILOGUE_RELU_BIAS;
-  } else if (activation == GEMMAndBiasActivationEpilogue::GELU) {
-    epilogue = CUBLASLT_EPILOGUE_GELU_BIAS;
-  }
+  const auto epilogue = [&]() -> cublasLtEpilogue_t {
+    // The cuBLAS documentation indicates that
+    // *_<ACTIVATION>_BIAS = *_<ACTIVATION>,
+    // but we keep it verbose here for clarity.
+    switch (activation) {
+      case GEMMAndBiasActivationEpilogue::RELU:
+        return bias ? CUBLASLT_EPILOGUE_RELU_BIAS : CUBLASLT_EPILOGUE_RELU;
+      case GEMMAndBiasActivationEpilogue::GELU:
+        return bias ? CUBLASLT_EPILOGUE_GELU_BIAS : CUBLASLT_EPILOGUE_GELU;
+      default:
+        return bias ? CUBLASLT_EPILOGUE_BIAS : CUBLASLT_EPILOGUE_DEFAULT;
+    }
+  }();
+  computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_EPILOGUE, epilogue);

-  if (bias != nullptr) {
-    computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_EPILOGUE, epilogue);
+  if (bias) {
    computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_BIAS_POINTER, bias);
  }

--- a/aten/src/ATen/cuda/detail/CUDAHooks.cpp
+++ b/aten/src/ATen/cuda/detail/CUDAHooks.cpp
@ -21,6 +21,7 @@

 #if AT_CUDNN_ENABLED()
 #include <ATen/cudnn/cudnn-wrapper.h>
+#include <cudnn_frontend.h>
 #endif

 #if AT_MAGMA_ENABLED()
@ -351,6 +352,26 @@ long CUDAHooks::versionCuDNN() const {
 #endif
 }

+long CUDAHooks::versionRuntimeCuDNN() const {
+#if AT_CUDNN_ENABLED()
+#ifndef USE_STATIC_CUDNN
+  return cudnnGetVersion();
+#else
+  return CUDNN_VERSION;
+#endif
+#else
+  TORCH_CHECK(false, "Cannot query CuDNN version if ATen_cuda is not built with CuDNN");
+#endif
+}
+
+long CUDAHooks::versionCuDNNFrontend() const {
+#if AT_CUDNN_ENABLED()
+  return CUDNN_FRONTEND_VERSION;
+#else
+  TORCH_CHECK(false, "Cannot query CuDNN Frontend version if ATen_cuda is not built with CuDNN");
+#endif
+}
+
 long CUDAHooks::versionMIOpen() const {
 #if AT_ROCM_ENABLED()
  return MIOPEN_VERSION_MAJOR * 10000 +
--- a/aten/src/ATen/cuda/detail/CUDAHooks.h
+++ b/aten/src/ATen/cuda/detail/CUDAHooks.h
@ -49,6 +49,8 @@ struct CUDAHooks : public at::CUDAHooksInterface {
  bool hasCUDART() const override;
  long versionCUDART() const override;
  long versionCuDNN() const override;
+  long versionRuntimeCuDNN() const override;
+  long versionCuDNNFrontend() const override;
  long versionMIOpen() const override;
  std::string showConfig() const override;
  double batchnormMinEpsilonCuDNN() const override;
--- a/aten/src/ATen/detail/CUDAHooksInterface.h
+++ b/aten/src/ATen/detail/CUDAHooksInterface.h
@ -174,6 +174,14 @@ struct TORCH_API CUDAHooksInterface : AcceleratorHooksInterface {
    TORCH_CHECK(false, "Cannot query cuDNN version without ATen_cuda library. ", CUDA_HELP);
  }

+  virtual long versionRuntimeCuDNN() const {
+    TORCH_CHECK(false, "Cannot query cuDNN version without ATen_cuda library. ", CUDA_HELP);
+  }
+
+  virtual long versionCuDNNFrontend() const {
+    TORCH_CHECK(false, "Cannot query cuDNN Frontend version without ATen_cuda library. ", CUDA_HELP);
+  }
+
  virtual long versionMIOpen() const {
    TORCH_CHECK(false, "Cannot query MIOpen version without ATen_cuda library. ", CUDA_HELP);
  }
--- a/aten/src/ATen/native/Convolution.cpp
+++ b/aten/src/ATen/native/Convolution.cpp
@ -409,7 +409,7 @@ struct ConvParams {
    if (!detail::getCUDAHooks().compiledWithCuDNN() || !input.is_cuda() || !cudnn_enabled) {
      return false;
    }
-    static long cudnn_version = detail::getCUDAHooks().versionCuDNN();
+    static long cudnn_version = detail::getCUDAHooks().versionRuntimeCuDNN();
    // broken on cuDNN 9.8 - 9.14
    if (cudnn_version >= 90800 && cudnn_version < 91500) {
      if (cudnn_conv_suggest_memory_format(input, weight) == at::MemoryFormat::Contiguous &&
@ -453,7 +453,7 @@ struct ConvParams {
    }
    // native kernel doesn't support 64-bit non-splittable case
    if (!(canUse32BitIndexMath(input) && canUse32BitIndexMath(weight))) {
-      static long cudnn_version = detail::getCUDAHooks().compiledWithCuDNN() ? detail::getCUDAHooks().versionCuDNN() : -1;
+      static long cudnn_version = detail::getCUDAHooks().compiledWithCuDNN() ? detail::getCUDAHooks().versionRuntimeCuDNN() : -1;
      // TODO(eqy): remove this once cuDNN fixes 64-bit depthwise support, first broken in 9.11x
      if (cudnn_conv_suggest_memory_format(input, weight) != at::MemoryFormat::Contiguous) {
        if (cudnn_version < 0 || cudnn_version > 91000) {
--- a/aten/src/ATen/native/Linear.cpp
+++ b/aten/src/ATen/native/Linear.cpp
@ -50,18 +50,35 @@ static inline bool parseLinearFlatten3d() {
 // `_flatten_nd_linear` flattens all but the last dimension of the input tensor
 // before passing it to linear operation
 static inline Tensor _flatten_nd_linear(const Tensor& input, const Tensor& weight, const Tensor& bias) {
-    const auto input_sizes = input.sym_sizes();
-    // can't use -1 in reshape because it errors when a dimension is 0
-    c10::SymInt flattened_dim = 1;
-    for (int64_t i = 0, ndim = input_sizes.size(); i < ndim - 1; ++i) {
-      flattened_dim = flattened_dim * input_sizes[i];
+  const auto input_sizes = input.sym_sizes();
+
+  const auto result_flattened = [&]() -> Tensor {
+    const auto input_ncols = input_sizes.back();
+    const auto input_flattened_nrows = [&]() -> c10::SymInt {
+      // can't use -1 in reshape because it errors when a dimension is 0
+      auto flattened_nrows = c10::SymInt{1};
+      for (const auto& size : input_sizes.slice(0, input_sizes.size() - 1)) {
+        flattened_nrows *= size;
+      }
+      return flattened_nrows;
+    }();
+
+    const auto input_flattened = input.view_symint({input_flattened_nrows, input_ncols});
+    if (weight.layout() == c10::kStrided) {
+      return at::addmm(bias, input_flattened, weight.t());
+    } else {
+      // weight is sparse, and addmm for sparse expects matmul lhs to be sparse,
+      // so we transpose the problem.
+      // NOTE: at::matmul handles (dense @ sparse) similarly.
+      const auto bias_t = (bias.dim() >= 2) ? bias.mT() : bias.unsqueeze(-1);
+      return at::addmm(bias_t, weight, input_flattened.t()).t();
    }
-    auto inp_reshape = input.reshape_symint({flattened_dim, input_sizes.at(input_sizes.size() -1)});
-    const auto result = at::addmm(bias, inp_reshape, weight.t());
-    auto new_size = input_sizes.slice(0, input_sizes.size() - 1);
-    c10::SymDimVector sizes_vec(new_size.begin(), new_size.end());
-    sizes_vec.push_back(result.sym_size(1));
-    return result.view_symint(sizes_vec);
+  }();
+
+  // Unflatten flattened row dims
+  auto result_sizes = c10::SymDimVector{input_sizes.begin(), input_sizes.end()};
+  result_sizes.back() = result_flattened.sym_size(1);
+  return result_flattened.view_symint(result_sizes);
 }


@ -90,15 +107,23 @@ Tensor linear(const Tensor& input, const Tensor& weight, const std::optional<Ten
    // Fused op is marginally faster.
    return at::addmm(*bias, input, weight.t());
  }
-  if (bias->defined() && !input.is_xla()) {
-    // Also hit the fused path for contiguous 3D input, if not using xla
+
+  const auto is_bias_likely_fusable = (
+      bias->defined() &&
+      // cuBLASLt: will fuse in the epilogue without copies
+      // when input/weight/bias are all strided.
+      // When weight is not strided, bias will not be fused,
+      // but we can still dispatch here to avoid at::matmul
+      // path which will probably use a very similar
+      // flattening optimization.
+      ((bias->dim() == 1 || bias->squeeze().dim() == 1) && bias->is_contiguous_or_false())
+  );
+  if (is_bias_likely_fusable && !input.is_xla()) {
+    // Also hit the fused path for contiguous nD input, if not using xla
    // backend. Reshaping/flattening has some performance implications on xla.
-    bool is_contiguous = input.is_contiguous_or_false();
-    if (is_contiguous && input_dim == 3) {
+    if (input.is_contiguous_or_false()) {
      return _flatten_nd_linear(input, weight, *bias);
-    } else if (is_contiguous && input.layout() == c10::kStrided && weight.layout() == c10::kStrided && bias->dim() == 1) {
-      return _flatten_nd_linear(input, weight, *bias);
-    } else if (parseLinearFlatten3d() && input_dim == 3) {
+    } else if (parseLinearFlatten3d()) {
      // If user forces flattening via env var
      const Tensor input_cont = input.contiguous();
      return _flatten_nd_linear(input_cont, weight, *bias);
--- a/aten/src/ATen/native/cpu/Reduce.h
+++ b/aten/src/ATen/native/cpu/Reduce.h
@ -247,8 +247,8 @@ void binary_kernel_reduce(TensorIteratorBase& iter, ops_t ops, init_t init) {
  });
 }

-template <typename func_t, typename vec_func_t, typename ident_t = double>
-void binary_kernel_reduce_vec(TensorIteratorBase& iter, func_t op, vec_func_t vop, ident_t ident = static_cast<ident_t>(0)) {
+template <typename func_t, typename vec_func_t>
+void binary_kernel_reduce_vec(TensorIteratorBase& iter, func_t op, vec_func_t vop, double ident = 0) {
  using traits = binary_function_traits<func_t>;
  static_assert(
    all_same<
--- a/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp
+++ b/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp
@ -339,13 +339,33 @@ void or_kernel_impl(TensorIterator& iter) {
  }
 }

+template<typename scalar_t>
+struct MinValuesOps: public at::native::MinOps<scalar_t> {
+  using arg_t = typename MinOps<scalar_t>::arg_t;
+  static scalar_t project(arg_t arg) {
+    return arg.first;
+  }
+};
+
 void min_values_kernel_impl(TensorIterator& iter) {
+  // This case is special because Vectorized<int64_t> does not
+  // handle upper_bound<int64_t>().
+  // See: https://github.com/pytorch/pytorch/issues/43254
+  if (iter.dtype() == kLong || iter.dtype() == kUInt64) {
+    AT_DISPATCH_V2(iter.dtype(), "min_values_cpu", AT_WRAP([&iter] {
+      binary_kernel_reduce(
+        iter,
+        MinValuesOps<scalar_t>{},
+        std::pair<scalar_t, int64_t>(upper_bound<scalar_t>(), -1));
+    }), kLong, kUInt64);
+    return;
+  }
  AT_DISPATCH_V2(iter.dtype(), "min_values_cpu", AT_WRAP([&iter] {
    binary_kernel_reduce_vec(
      iter,
      [](scalar_t a, scalar_t b) -> scalar_t { return min_impl(a, b); },
      [](Vectorized<scalar_t> a, Vectorized<scalar_t> b) { return minimum(a, b); },
-      upper_bound<scalar_t>());
+      static_cast<double>(upper_bound<scalar_t>()));
  }), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), kBFloat16, kHalf, kBool);
 }

--- a/aten/src/ATen/native/cuda/Blas.cpp
+++ b/aten/src/ATen/native/cuda/Blas.cpp
@ -147,14 +147,24 @@ static bool isGloballyDisabledAddmmCudaLt(const at::Device& device) {
 /*
 * Check whether for the given input we want to enable the Lt interface
 */
-static bool isInputCompliesAddmmCudaLt(Tensor& result, const Tensor& self, const Tensor& mat1, const Tensor& mat2, const Scalar& beta, const Scalar& alpha) {
+static bool isInputCompliesAddmmCudaLt(
+    Tensor& result,
+    const Tensor& self,
+    const Tensor& mat1,
+    const Tensor& mat2,
+    const Scalar& beta,
+    const Scalar& alpha,
+    Activation activation
+) {
+  #ifdef USE_ROCM
  // Implies 2D bias which we currently not send through Lt.
  // TODO: this check is done pre col-major input preparation,
  // so, this condition can be ralexed in cases when a col-major
  // copy of result is needed.
-  if (result.is_same(self)) {
+  if (self.is_same(result) || self.dim() == 2) {
    return false;
  }
+  #endif

  #if defined(USE_ROCM) && ROCM_VERSION == 60400
  // hipblaslt TT fp32 regression on ROCm 6.4, cannot use
@ -169,13 +179,33 @@ static bool isInputCompliesAddmmCudaLt(Tensor& result, const Tensor& self, const
  #if defined(CUDA_VERSION) || defined(USE_ROCM)
  const auto scalar_type = mat1.scalar_type();
  return (beta.toComplexDouble() == 1.0
+    // NOTE: row-major result is important when bias is 1D.
+    // This is because Lt broadcasts 1D bias over the columns
+    // while the aten::addmm API broadcasts it over the rows,
+    // and this is in conjunction with the data preparation
+    // procedure that does not transpose arguments with
+    // col-major result. For col-major result we need
+    // to explicitly transpose the problem so that bias is
+    // correctly applied.
+    // TODO: enable col-major result if needed.
+    // TODO: no need to check result's layout when
+    // !result.is_same(self) and self.dim() == 2, because
+    // self needs to be copied into result and the bias ptr
+    // will be ignored.
    && result.dim() == 2 && result.is_contiguous()
-    // Conditions for bias to be fusable
    && (
-      self.is_contiguous() &&
-      // NOTE: fine to have 1-len dims to the left from the right-most one
-      (self.dim() == 1 || self.squeeze().dim() == 1) &&
-      self.sizes().back() == mat2_sizes[1]
+      ( // Conditions for bias to be fusable -- implies direct Lt path without copies.
+        self.is_contiguous() &&
+        // NOTE: fine to have 1-len dims to the left from the right-most one
+        (self.dim() == 1 || self.squeeze().dim() == 1) &&
+        self.sizes().back() == mat2_sizes[1]
+      )
+      || ( // 2D bias restrictions. self.is_contiguous() is implicit when result.is_same(self),
+        // and we need to copy self into result otherwise, so self's layout becomes irrelevant.
+        // See also TODO from above.
+        activation != Activation::None && // Lt is faster when activation is fused
+        (self.dim() == 2 && at::is_expandable_to(self.sizes(), {mat1_sizes[0], mat2_sizes[1]}))
+      )
    )
    && ( // some dtype restrictions
      #ifndef USE_ROCM
@ -270,7 +300,16 @@ bool launchGemmAndBiasCublasLt(
    const Scalar& alpha,
    Activation activation = Activation::None
 ) {
-  const auto* self_ptr = self.const_data_ptr<scalar_t>();
+  // We apply bias in the epilogue only when it is 1D,
+  // or when it can be squeezed to 1D.
+  // self_ptr == nullptr implies ignore bias epilogue
+  // and use standard gemm-like API.
+  const auto* self_ptr = [&]() -> auto {
+    if (self.dim() == 1 || self.squeeze().dim() == 1) {
+      return self.const_data_ptr<scalar_t>();
+    }
+    return static_cast<const scalar_t*>(nullptr);
+  }();

  const auto tuning_ctx = at::cuda::tunable::getTuningContext();
  if (tuning_ctx->IsTunableOpEnabled()) {
@ -356,7 +395,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
  disable_addmm_cuda_lt = isGloballyDisabledAddmmCudaLt(self.device()) || disable_addmm_cuda_lt;
  #endif
  // Condition on the input
-  disable_addmm_cuda_lt = !isInputCompliesAddmmCudaLt(result, self, mat1, mat2, beta, alpha) || disable_addmm_cuda_lt;
+  disable_addmm_cuda_lt = !isInputCompliesAddmmCudaLt(result, self, mat1, mat2, beta, alpha, activation) || disable_addmm_cuda_lt;
  // }

  at::ScalarType scalar_type = mat1.scalar_type();
@ -366,19 +405,20 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
  if (!result.is_same(self)) {
    at::native::resize_output(result, {mat1.sizes()[0], mat2.sizes()[1]});

+    // We use bias ptr in the Lt path only when bias is 1D
+    const auto use_bias_ptr_lt = (self.dim() == 1) && !disable_addmm_cuda_lt;
    const auto self_maybe_expanded = [&]() -> c10::MaybeOwned<Tensor> {
-      if (disable_addmm_cuda_lt) {
-        // When in non-Lt path we do expand self even before
+      if (!use_bias_ptr_lt) {
+        // We do expand self even before
        // check for beta != 0.0 to make sure that
        // test_sparse_csr.py::TestSparseCSRCUDA::test_addmm_errors_*
        // runs green.
        return expand_size(self, result.sizes(), "addmm");
      }
-      // copy next, should broadcast
      return c10::MaybeOwned<Tensor>::borrowed(self);
    }();
-    // We copy bias when in the non-Lt path
-    if (beta.toComplexDouble() != 0.0 && disable_addmm_cuda_lt) {
+    // We copy bias into result unless the Lt path needs the bias ptr
+    if (beta.toComplexDouble() != 0.0 && !use_bias_ptr_lt) {
      // NOTE: self should broadcast over result
      at::native::copy_(result, *self_maybe_expanded);
    }
--- a/aten/src/ATen/native/cuda/CUDALoops.cuh
+++ b/aten/src/ATen/native/cuda/CUDALoops.cuh
@ -884,6 +884,69 @@ struct type_specialized_kernel_launcher {
  }
 };

+template <int arg_index>
+struct type_specialized_broadcast_kernel_launcher {
+  template <
+      typename func_t,
+      typename array_t,
+      typename dtypes_t,
+      typename calc_t>
+  static void apply(
+      int64_t numel,
+      func_t f,
+      array_t data,
+      dtypes_t dtypes,
+      calc_t offset_calc) {
+        using traits = function_traits<func_t>;
+        using ret_t = typename traits::result_type;
+        using arg0_t = typename traits::template arg<0>::type;
+        using arg1_t = typename traits::template arg<1>::type;
+        if (dtypes[0] == rt_binary_specializations[arg_index][0] &&
+          dtypes[1] == rt_binary_specializations[arg_index][1] &&
+          dtypes[2] == rt_binary_specializations[arg_index][2]) {
+            using ret_cpp_t = c10::impl::ScalarTypeToCPPTypeT<rt_binary_specializations[arg_index][0]>;
+            using arg0_cpp_t = c10::impl::ScalarTypeToCPPTypeT<rt_binary_specializations[arg_index][1]>;
+            using arg1_cpp_t = c10::impl::ScalarTypeToCPPTypeT<rt_binary_specializations[arg_index][2]>;
+            constexpr int grp_sz = 128;
+            launch_legacy_kernel_manual_unroll<grp_sz, 4>(numel, [=] GPU_LAMBDA(int idx, bool unrl) {
+              if (unrl) {
+                auto offsets0 = offset_calc.get(idx);
+                auto offsets1 = offset_calc.get(idx + grp_sz);
+                auto offsets2 = offset_calc.get(idx + grp_sz * 2);
+                auto offsets3 = offset_calc.get(idx + grp_sz * 3);
+                void* out0 = data[0] + offsets0[0];
+                void* out1 = data[0] + offsets1[0];
+                void* out2 = data[0] + offsets2[0];
+                void* out3 = data[0] + offsets3[0];
+                auto u = c10::load<arg0_cpp_t>(data[1] + offsets0[1]);
+                auto v = c10::load<arg1_cpp_t>(data[2] + offsets0[2]);
+                ret_t result0 = f(c10::convert<arg0_t>(u), c10::convert<arg1_t>(v));
+                auto u1 = c10::load<arg0_cpp_t>(data[1] + offsets1[1]);
+                auto v1 = c10::load<arg1_cpp_t>(data[2]+ offsets1[2]);
+                ret_t result1 = f(c10::convert<arg0_t>(u1), c10::convert<arg1_t>(v1));
+                auto u2 = c10::load<arg0_cpp_t>(data[1] + offsets2[1]);
+                auto v2 = c10::load<arg1_cpp_t>(data[2] + offsets2[2]);
+                ret_t result2 = f(c10::convert<arg0_t>(u2), c10::convert<arg1_t>(v2));
+                auto u3 = c10::load<arg0_cpp_t>(data[1] + offsets3[1]);
+                auto v3 = c10::load<arg1_cpp_t>(data[2] + offsets3[2]);
+                ret_t result3 = f(c10::convert<arg0_t>(u3), c10::convert<arg1_t>(v3));
+                *(ret_cpp_t*)out0 = c10::convert<ret_cpp_t>(result0);
+                *(ret_cpp_t*)out1 = c10::convert<ret_cpp_t>(result1);
+                *(ret_cpp_t*)out2 = c10::convert<ret_cpp_t>(result2);
+                *(ret_cpp_t*)out3 = c10::convert<ret_cpp_t>(result3);
+              } else {
+                auto offsets = offset_calc.get(idx);
+                void* out = data[0] + offsets[0];
+                auto u = c10::load<arg0_cpp_t>(data[1] + offsets[1]);
+                auto v = c10::load<arg1_cpp_t>(data[2] + offsets[2]);
+                ret_t result = f(c10::convert<arg0_t>(u), c10::convert<arg1_t>(v));
+                *(ret_cpp_t*)out = c10::convert<ret_cpp_t>(result);
+              }
+            });
+        }
+      }
+};
+
 } // namespace
 #endif

@ -1002,6 +1065,32 @@ void gpu_kernel_impl(TensorIteratorBase& iter, const func_t& f) {
    }
    auto offset_calc = ::make_offset_calculator<traits::arity + 1>(iter);
 #ifdef USE_ROCM
+    if (check_binary_rt_types_for_specialization(iter)) {
+      // constexpr to reduce the number of kernels generated for
+      // broadcast elementwise ops with mixed dtypes and to limit which functors
+      // are actually applied to the load and store at compile time.
+      using func_tuple = typename traits::ArgsTuple;
+      if constexpr (
+        std::is_same_v<float, arg0_t> && traits::arity == 2 &&
+        check_binary_functor_types_for_specialization<
+          func_tuple,
+          float,
+          float,
+          traits::arity,
+          /*arg_num=*/0>::check()) {
+            memory::detail::static_unroll<
+              type_specialized_broadcast_kernel_launcher,
+              rt_binary_specializations.size()>::with_args(
+                numel,
+                f,
+                data,
+                dtypes,
+                offset_calc
+            );
+            return;
+      }
+    }
+
    constexpr int grp_sz = 128;
    launch_legacy_kernel_manual_unroll<grp_sz, 4>(numel, [=] GPU_LAMBDA(int idx, bool unrl) {
      if (unrl) {
--- a/aten/src/ATen/native/mkldnn/xpu/detail/QConv.cpp
+++ b/aten/src/ATen/native/mkldnn/xpu/detail/QConv.cpp
@ -133,7 +133,7 @@ at::Tensor quantized_convolution(
  // supported in conv.
  mask_weight = weight_zero_points.numel() > 1 ? 1 : 0;
  if (groups > 1 && weight_zero_points.numel() > 1)
-    mask_weight = (2 ^ 0) | (2 ^ 1); // 2^0 (group) | 2^1 (output channel)
+    mask_weight = (1 << 0) | (1 << 1); // 2^0 (group) | 2^1 (output channel)
  dnnl::primitive_attr pattr;

  bool src_need_zp = (act_zero_point != 0);
--- a/aten/src/ATen/native/mps/operations/Blas.mm
+++ b/aten/src/ATen/native/mps/operations/Blas.mm
@ -141,6 +141,9 @@ static Tensor& addmv_out_mps_impl(const Tensor& self,
  };

  MPSStream* stream = at::mps::getCurrentMPSStream();
+  if (result.numel() == 0) {
+    return result;
+  }
  Tensor matMulVec = at::mm(mat, vec.unsqueeze(1)).squeeze(1);

  @autoreleasepool {
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@ -2803,7 +2803,7 @@
 - func: floor_divide.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
  device_check: NoCheck   # TensorIterator
  dispatch:
-    CPU, CUDA, MPS: floor_divide_out
+    CPU, CUDA, MPS, MTIA: floor_divide_out
    SparseCPU, SparseCUDA, SparseMPS: floor_divide_out_sparse_zerodim

 - func: floor_divide.Scalar(Tensor self, Scalar other) -> Tensor
--- a/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp
+++ b/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp
@ -478,7 +478,7 @@ bool check_cudnn_tensor_shapes(sdp_params const& params, bool debug) {
  const auto s_k = params.key.sym_size(2);
  const auto d_qk = params.query.sym_size(3);
  const auto d_v = params.value.sym_size(3);
-  long cudnn_version = at::detail::getCUDAHooks().versionCuDNN();
+  long cudnn_version = at::detail::getCUDAHooks().versionRuntimeCuDNN();
  if (cudnn_version < 8903) {
    if (debug) {
      TORCH_WARN("SDPA fprop requires cudnn 8.9.3 or higher");
@ -709,7 +709,7 @@ bool can_use_cudnn_attention(const sdp_params& params, bool debug) {
  return false;
 #endif
 #if defined(CUDNN_VERSION)
-  static auto cudnn_version = cudnnGetVersion();
+  static auto cudnn_version = at::detail::getCUDAHooks().versionRuntimeCuDNN();
  if (params.dropout > 0.0 && cudnn_version > 91100 && cudnn_version < 91400) {
    if (debug) {
      TORCH_WARN(CUDNN_VERSION, " cuDNN version does not support droppout in SDPA (9.11 - 9.13).");
--- a/c10/core/AutogradState.h
+++ b/c10/core/AutogradState.h
@ -1,6 +1,8 @@
 #pragma once

+#include <c10/core/SafePyObject.h>
 #include <c10/macros/Export.h>
+#include <optional>

 namespace c10 {

@ -15,7 +17,8 @@ struct C10_API AutogradState {
      bool inference_mode,
      bool fw_grad_mode,
      bool multithreading_enabled)
-      : grad_mode_(grad_mode),
+      : graph_exec_group_(std::nullopt),
+        grad_mode_(grad_mode),
        inference_mode_(inference_mode),
        fw_grad_mode_(fw_grad_mode),
        multithreading_enabled_(multithreading_enabled),
@ -41,6 +44,10 @@ struct C10_API AutogradState {
    view_replay_enabled_ = view_replay_enabled;
  }

+  void set_graph_exec_group(std::optional<SafePyObject> group) {
+    graph_exec_group_ = std::move(group);
+  }
+
  bool get_grad_mode() const {
    return grad_mode_;
  }
@ -61,7 +68,12 @@ struct C10_API AutogradState {
    return view_replay_enabled_;
  }

+  const std::optional<SafePyObject>& get_graph_exec_group() const {
+    return graph_exec_group_;
+  }
+
 private:
+  std::optional<SafePyObject> graph_exec_group_;
  bool grad_mode_ : 1;
  bool inference_mode_ : 1;
  bool fw_grad_mode_ : 1;
--- a/c10/cuda/CUDAAllocatorConfig.cpp
+++ b/c10/cuda/CUDAAllocatorConfig.cpp
@ -106,6 +106,9 @@ void CUDAAllocatorConfig::parseArgs(const std::string& env) {
    } else if (key == "graph_capture_record_stream_reuse") {
      i = parseGraphCaptureRecordStreamReuse(tokenizer, i);
      used_native_specific_option = true;
+    } else if (key == "per_process_memory_fraction") {
+      i = parsePerProcessMemoryFraction(tokenizer, i);
+      used_native_specific_option = true;
    } else {
      const auto& keys =
          c10::CachingAllocator::AcceleratorAllocatorConfig::getKeys();
@ -146,6 +149,18 @@ size_t CUDAAllocatorConfig::parseGraphCaptureRecordStreamReuse(
  return i;
 }

+// Parses the value of the "per_process_memory_fraction" option. Expects a ":"
+// token followed by a floating-point value in [0.0, 1.0] after the key at
+// index `i`, stores the value, and returns the index of the last consumed
+// token (a token index, hence size_t like the sibling parse* helpers).
+size_t CUDAAllocatorConfig::parsePerProcessMemoryFraction(
+    const c10::CachingAllocator::ConfigTokenizer& tokenizer,
+    size_t i) {
+  tokenizer.checkToken(++i, ":");
+  double val_env = tokenizer.toDouble(++i);
+  TORCH_CHECK_VALUE(
+      val_env >= 0.0 && val_env <= 1.0,
+      "per_process_memory_fraction is invalid, set it in [0.0, 1.0]");
+  m_per_process_memory_fraction = val_env;
+  return i;
+}
+
 size_t CUDAAllocatorConfig::parsePinnedNumRegisterThreads(
    const c10::CachingAllocator::ConfigTokenizer& tokenizer,
    size_t i) {
--- a/c10/cuda/CUDAAllocatorConfig.h
+++ b/c10/cuda/CUDAAllocatorConfig.h
@ -61,6 +61,10 @@ class C10_CUDA_API CUDAAllocatorConfig {
    return instance().m_graph_capture_record_stream_reuse;
  }

+  static double per_process_memory_fraction() {
+    return instance().m_per_process_memory_fraction;
+  }
+
  /** Pinned memory allocator settings */
  static bool pinned_use_cuda_host_register() {
    return instance().m_pinned_use_cuda_host_register;
@ -152,7 +156,8 @@ class C10_CUDA_API CUDAAllocatorConfig {
        "pinned_use_hip_host_register",
        "graph_capture_record_stream_reuse",
        "pinned_reserve_segment_size_mb",
-        "pinned_num_register_threads"};
+        "pinned_num_register_threads",
+        "per_process_memory_fraction"};
    return keys;
  }

@ -177,6 +182,9 @@ class C10_CUDA_API CUDAAllocatorConfig {
  size_t parseGraphCaptureRecordStreamReuse(
      const c10::CachingAllocator::ConfigTokenizer& tokenizer,
      size_t i);
+  size_t parsePerProcessMemoryFraction(
+      const c10::CachingAllocator::ConfigTokenizer& tokenizer,
+      size_t i);

  std::atomic<size_t> m_pinned_num_register_threads{1};
  std::atomic<size_t> m_pinned_reserve_segment_size_mb{0};
@ -189,6 +197,7 @@ class C10_CUDA_API CUDAAllocatorConfig {
  std::atomic<bool> m_release_lock_on_cudamalloc{false};
  std::atomic<bool> m_pinned_use_cuda_host_register{false};
  std::atomic<bool> m_graph_capture_record_stream_reuse{false};
+  std::atomic<double> m_per_process_memory_fraction{1.0};
 };

 // Keep this for backwards compatibility
--- a/c10/cuda/CUDACachingAllocator.cpp
+++ b/c10/cuda/CUDACachingAllocator.cpp
@ -1100,7 +1100,7 @@ class RingBuffer {
 } // anonymous namespace
 } // namespace Native

-static std::string reportProcessMemoryInfo(c10::DeviceIndex device) {
+static std::string reportProcessMemoryInfo(const cudaDeviceProp& prop) {
 #ifdef PYTORCH_C10_DRIVER_API_SUPPORTED
  void* nvml_handle = DriverAPI::get_nvml_handle();
  if (!nvml_handle) {
@ -1111,9 +1111,6 @@ static std::string reportProcessMemoryInfo(c10::DeviceIndex device) {
    return true;
  }();

-  cudaDeviceProp prop{};
-  C10_CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
-
  // NOLINTNEXTLINE(*-c-arrays)
  char pci_id[80];
  snprintf(
@ -1215,14 +1212,16 @@ class DeviceCachingAllocator {
  // record used memory.
  size_t total_allocated_memory = 0;

-  size_t allowed_memory_maximum = 0;
+  cudaDeviceProp device_prop;
+
+  // maximum amount of memory that device is allowed to
+  // allocate. This is set iff memory fraction is less than 1
+  std::optional<size_t> allowed_memory_maximum{std::nullopt};

  // all live expandable segments
  std::vector<ExpandableSegment*> expandable_segments_;
  std::vector<c10::DeviceIndex> devices_with_peer_access_;

-  bool set_fraction = false;
-
  bool record_history = false;

  std::atomic<CreateContextFn> context_recorder_;
@ -1264,6 +1263,9 @@ class DeviceCachingAllocator {
      : device_id(id),
        large_blocks(/*small=*/false),
        small_blocks(/*small=*/true) {
+    C10_CUDA_CHECK(cudaGetDeviceProperties(&device_prop, id));
+
+    setMemoryFraction(CUDAAllocatorConfig::per_process_memory_fraction());
    stats.max_split_size =
        static_cast<int64_t>(AcceleratorAllocatorConfig::max_split_size());
    context_recorder_.store(nullptr);
@ -1399,7 +1401,7 @@ class DeviceCachingAllocator {
    if (!block_found) {
      // Do garbage collection if the flag is set.
      if (C10_UNLIKELY(
-              set_fraction &&
+              allowed_memory_maximum.has_value() &&
              AcceleratorAllocatorConfig::garbage_collection_threshold() >
                  0.0)) {
        garbage_collect_cached_blocks(context);
@ -1456,11 +1458,12 @@ class DeviceCachingAllocator {
      C10_CUDA_CHECK(cudaMemGetInfo(&device_free, &device_total));
      std::string allowed_info;

-      if (set_fraction) {
-        allowed_info = format_size(allowed_memory_maximum) + " allowed; ";
+      if (allowed_memory_maximum.has_value()) {
+        allowed_info =
+            format_size(allowed_memory_maximum.value()) + " allowed; ";
      }

-      std::string proc_info = reportProcessMemoryInfo(device_id);
+      std::string proc_info = reportProcessMemoryInfo(device_prop);

      record_trace(
          TraceEntry::OOM,
@ -1518,7 +1521,7 @@ class DeviceCachingAllocator {
      for (const auto& obs : observers_local) {
        obs(device_id,
            alloc_size,
-            set_fraction ? allowed_memory_maximum : device_total,
+            allowed_memory_maximum.value_or(device_total),
            device_free);
      }

@ -2015,25 +2018,26 @@ class DeviceCachingAllocator {

  /** get memory fraction limiting maximum allocated memory **/
  double getMemoryFraction() {
-    if (!set_fraction) {
+    if (!allowed_memory_maximum.has_value()) {
      return 1.0;
    }

-    size_t device_free = 0;
-    size_t device_total = 0;
-    C10_CUDA_CHECK(cudaMemGetInfo(&device_free, &device_total));
-    return static_cast<double>(allowed_memory_maximum) /
-        static_cast<double>(device_total);
+    return static_cast<double>(allowed_memory_maximum.value()) /
+        static_cast<double>(device_prop.totalGlobalMem);
  }

  /** set memory fraction to limit maximum allocated memory **/
  void setMemoryFraction(double fraction) {
-    size_t device_free = 0;
-    size_t device_total = 0;
-    C10_CUDA_CHECK(cudaMemGetInfo(&device_free, &device_total));
-    allowed_memory_maximum =
-        static_cast<size_t>(fraction * static_cast<double>(device_total));
-    set_fraction = true;
+    TORCH_CHECK(
+        0 <= fraction && fraction <= 1,
+        "invalid fraction:",
+        fraction,
+        ". Please set within [0, 1].");
+    allowed_memory_maximum = std::nullopt;
+    if (fraction < 1.0) {
+      allowed_memory_maximum = static_cast<size_t>(
+          fraction * static_cast<double>(device_prop.totalGlobalMem));
+    }
  }

  /** get expandable segment size for all the streams on device **/
@ -3010,7 +3014,7 @@ class DeviceCachingAllocator {
    BlockPool& pool = *p.pool;

    if (C10_UNLIKELY(
-            set_fraction &&
+            allowed_memory_maximum.has_value() &&
            AcceleratorAllocatorConfig::garbage_collection_threshold() > 0.0)) {
      // Track block reuse interval only when garbage collection is enabled.
      ++pool.get_free_blocks_call_count;
@ -3083,7 +3087,7 @@ class DeviceCachingAllocator {

    size_t gc_threshold = static_cast<size_t>(
        AcceleratorAllocatorConfig::garbage_collection_threshold() *
-        static_cast<double>(allowed_memory_maximum));
+        static_cast<double>(allowed_memory_maximum.value()));
    // No need to trigger GC yet
    if (total_allocated_memory <= gc_threshold) {
      return;
@ -3161,8 +3165,8 @@ class DeviceCachingAllocator {

    bool active_pool =
        p.pool->owner_PrivatePool && p.pool->owner_PrivatePool->allocator();
-    if (set_fraction &&
-        total_allocated_memory + size > allowed_memory_maximum) {
+    if (allowed_memory_maximum.has_value() &&
+        total_allocated_memory + size > allowed_memory_maximum.value()) {
      p.err = cudaErrorMemoryAllocation;
      return false;
      // Temporarily disable checkpointing & cudagraphs internally
@ -3859,7 +3863,6 @@ class NativeCachingAllocator : public CUDAAllocator {
        "Allocator not initialized for device ",
        device,
        ": did you call init?");
-    C10_CUDA_CHECK(c10::cuda::SetDevice(device));
    return device_allocator[device]->getMemoryFraction();
  }

@ -3869,12 +3872,6 @@ class NativeCachingAllocator : public CUDAAllocator {
        "Allocator not initialized for device ",
        device,
        ": did you call init?");
-    TORCH_CHECK(
-        0 <= fraction && fraction <= 1,
-        "invalid fraction:",
-        fraction,
-        ". Please set within [0, 1].");
-    C10_CUDA_CHECK(c10::cuda::SetDevice(device));
    device_allocator[device]->setMemoryFraction(fraction);
  }

--- a/c10/cuda/CUDACachingAllocator.h
+++ b/c10/cuda/CUDACachingAllocator.h
@ -2,6 +2,7 @@

 #include <c10/core/AllocatorConfig.h>
 #include <c10/core/CachingDeviceAllocator.h>
+#include <c10/cuda/CUDAAllocatorConfig.h>
 #include <c10/cuda/CUDAGraphsC10Utils.h>
 #include <c10/cuda/CUDAMacros.h>
 #include <c10/cuda/CUDAStream.h>
--- a/c10/cuda/CUDAMallocAsyncAllocator.cpp
+++ b/c10/cuda/CUDAMallocAsyncAllocator.cpp
@ -427,7 +427,6 @@ struct CudaMallocAsyncAllocator : public CUDAAllocator {
  // on the current device each later call sees.
  void init(int dev_count) override {
    static bool called = [](int dev_count) {
-      ;
      // Are there external guarantees init will be called before
      // any of the allocator's other functions?
      // std::lock_guard<std::mutex> lk(general_mutex);
--- a/c10/test/build.bzl
+++ b/c10/test/build.bzl
@ -66,6 +66,15 @@ def define_targets(rules):
        ],
    )

+    rules.cc_test(
+        name = "util/nofatal_test",
+        srcs = ["util/nofatal_test.cpp"],
+        deps = [
+            "//c10/util:base",
+            "@com_google_googletest//:gtest_main",
+        ],
+    )
+
    rules.cc_test(
        name = "util/ssize_test",
        srcs = ["util/ssize_test.cpp"],
--- a/c10/test/util/nofatal_test.cpp
+++ b/c10/test/util/nofatal_test.cpp
@ -0,0 +1,53 @@
+#include <gtest/gtest.h>
+
+#include <c10/util/Exception.h>
+#include <c10/util/Logging.h>
+
+namespace {
+template <typename T>
+inline void expectThrowsEq(T&& fn, const char* expected_msg) {
+  try {
+    std::forward<T>(fn)();
+  } catch (const c10::Error& e) {
+    EXPECT_TRUE(
+        std::string(e.what_without_backtrace()).find(expected_msg) !=
+        std::string::npos);
+    return;
+  }
+  ADD_FAILURE() << "Expected to throw exception with message \"" << expected_msg
+                << "\" but didn't throw";
+}
+} // namespace
+
+TEST(NofatalTest, TorchCheckComparisons) {
+  // Sanity check: a passing check is a no-op, even with a streamed message.
+  TORCH_CHECK_EQ(1, 1) << "i am a silly message " << 1;
+  expectThrowsEq(
+      []() { TORCH_CHECK_EQ(1, 2) << "i am a silly message " << 1; },
+      "Check failed: 1 == 2 (1 vs. 2). i am a silly message 1");
+  expectThrowsEq(
+      []() { TORCH_CHECK_NE(2, 2); }, "Check failed: 2 != 2 (2 vs. 2).");
+  expectThrowsEq(
+      []() { TORCH_CHECK_LT(2, 2); }, "Check failed: 2 < 2 (2 vs. 2).");
+  expectThrowsEq(
+      []() { TORCH_CHECK_LE(3, 2); }, "Check failed: 3 <= 2 (3 vs. 2).");
+  expectThrowsEq(
+      []() { TORCH_CHECK_GT(2, 2); }, "Check failed: 2 > 2 (2 vs. 2).");
+  expectThrowsEq(
+      []() { TORCH_CHECK_GE(2, 3); }, "Check failed: 2 >= 3 (2 vs. 3).");
+  expectThrowsEq(
+      []() {
+        void* p = nullptr;
+        TORCH_CHECK_NOTNULL(p);
+      },
+      "Check failed: 'p' must be non NULL.");
+
+#if GTEST_HAS_DEATH_TEST
+#ifndef NDEBUG
+  // In debug builds, a failing DCHECK should terminate the process.
+  EXPECT_DEATH(TORCH_DCHECK_EQ(1, 2), "Check failed");
+#else
+  TORCH_DCHECK_EQ(1, 2); // no-op in release (NDEBUG) builds
+#endif
+#endif // GTEST_HAS_DEATH_TEST
+}
--- a/c10/util/Exception.h
+++ b/c10/util/Exception.h
@ -702,6 +702,98 @@ namespace c10::detail {
 #define TORCH_CHECK_ARG(cond, argN, ...) \
  TORCH_CHECK(cond, "invalid argument ", argN, ": ", __VA_ARGS__)

+// FATAL_IF(condition) is a no-op when `condition` holds; otherwise it opens a
+// FATAL ::c10::MessageLogger stream (constructed with the default
+// exit_on_fatal) to which the caller can append a message with <<.
+// The macro parameter is parenthesized so any expression can be passed safely.
+#ifndef FATAL_IF
+#ifdef C10_USE_GLOG
+#define FATAL_IF(condition)                                              \
+  (condition) ? (void)0                                                  \
+              : ::c10::LoggerVoidify() &                                 \
+          ::c10::MessageLogger(__FILE__, __LINE__, ::google::GLOG_FATAL) \
+              .stream()
+#else
+#define FATAL_IF(condition)              \
+  (condition) ? (void)0                  \
+              : ::c10::LoggerVoidify() & \
+          ::c10::MessageLogger(__FILE__, __LINE__, ::c10::GLOG_FATAL).stream()
+#endif
+#endif
+
+// NON_FATAL_IF(condition) behaves like FATAL_IF but constructs the
+// MessageLogger with exit_on_fatal == false.
+#ifndef NON_FATAL_IF
+#ifdef C10_USE_GLOG
+#define NON_FATAL_IF(condition)                                \
+  (condition) ? (void)0                                        \
+              : ::c10::LoggerVoidify() &                       \
+          ::c10::MessageLogger(                                \
+              __FILE__, __LINE__, ::google::GLOG_FATAL, false) \
+              .stream()
+#else
+#define NON_FATAL_IF(condition)                                              \
+  (condition) ? (void)0                                                      \
+              : ::c10::LoggerVoidify() &                                     \
+          ::c10::MessageLogger(__FILE__, __LINE__, ::c10::GLOG_FATAL, false) \
+              .stream()
+#endif
+#endif
+
+// Binary comparison checks: CHECK_OP throws c10::Error, DCHECK_OP is fatal.
+#define TORCH_CHECK_OP(val1, val2, op)                                      \
+  NON_FATAL_IF(((val1)op(val2)))                                            \
+      << "Check failed: " #val1 " " #op " " #val2 " (" << (val1) << " vs. " \
+      << (val2) << "). "
+
+#define TORCH_DCHECK_OP(val1, val2, op)                                       \
+  FATAL_IF(((val1)op(val2))) << "Check failed: " #val1 " " #op " " #val2 " (" \
+                             << (val1) << " vs. " << (val2) << "). "
+
+#define TORCH_CHECK_EQ(val1, val2) TORCH_CHECK_OP(val1, val2, ==)
+#define TORCH_CHECK_NE(val1, val2) TORCH_CHECK_OP(val1, val2, !=)
+#define TORCH_CHECK_LE(val1, val2) TORCH_CHECK_OP(val1, val2, <=)
+#define TORCH_CHECK_LT(val1, val2) TORCH_CHECK_OP(val1, val2, <)
+#define TORCH_CHECK_GE(val1, val2) TORCH_CHECK_OP(val1, val2, >=)
+#define TORCH_CHECK_GT(val1, val2) TORCH_CHECK_OP(val1, val2, >)
+
+// Debug versions of TORCH_CHECK_OP macros
+#ifndef NDEBUG
+#define TORCH_DCHECK_EQ(val1, val2) TORCH_DCHECK_OP(val1, val2, ==)
+#define TORCH_DCHECK_NE(val1, val2) TORCH_DCHECK_OP(val1, val2, !=)
+#define TORCH_DCHECK_LE(val1, val2) TORCH_DCHECK_OP(val1, val2, <=)
+#define TORCH_DCHECK_LT(val1, val2) TORCH_DCHECK_OP(val1, val2, <)
+#define TORCH_DCHECK_GE(val1, val2) TORCH_DCHECK_OP(val1, val2, >=)
+#define TORCH_DCHECK_GT(val1, val2) TORCH_DCHECK_OP(val1, val2, >)
+#else // !NDEBUG
+// Optimized versions - generate no code
+#define TORCH_DCHECK_EQ(val1, val2) \
+  while (false)                     \
+  TORCH_DCHECK_OP(val1, val2, ==)
+#define TORCH_DCHECK_NE(val1, val2) \
+  while (false)                     \
+  TORCH_DCHECK_OP(val1, val2, !=)
+#define TORCH_DCHECK_LE(val1, val2) \
+  while (false)                     \
+  TORCH_DCHECK_OP(val1, val2, <=)
+#define TORCH_DCHECK_LT(val1, val2) \
+  while (false)                     \
+  TORCH_DCHECK_OP(val1, val2, <)
+#define TORCH_DCHECK_GE(val1, val2) \
+  while (false)                     \
+  TORCH_DCHECK_OP(val1, val2, >=)
+#define TORCH_DCHECK_GT(val1, val2) \
+  while (false)                     \
+  TORCH_DCHECK_OP(val1, val2, >)
+#endif // NDEBUG
+
+// Null pointer check macro
+#define TORCH_CHECK_NOTNULL(val) \
+  ::c10::CheckNotNull(__FILE__, __LINE__, #val, (val), false)
+
+#ifndef NDEBUG
+#define TORCH_DCHECK_NOTNULL(val) \
+  ::c10::CheckNotNull(__FILE__, __LINE__, #val, (val), true)
+#else // !NDEBUG
+#define TORCH_DCHECK_NOTNULL(val) \
+  while (false)                   \
+  TORCH_CHECK_NOTNULL(val)
+#endif // NDEBUG
+
 // ----------------------------------------------------------------------------
 // Deprecated macros
 // ----------------------------------------------------------------------------
--- a/c10/util/Logging.cpp
+++ b/c10/util/Logging.cpp
@ -291,6 +291,32 @@ namespace c10 {
 using fLB::FLAGS_logtostderr;
 using fLI::FLAGS_minloglevel;
 using fLI::FLAGS_v;
+
+MessageLogger::MessageLogger(
+    const char* file,
+    int line,
+    int severity,
+    bool exit_on_fatal)
+    : stream_(), severity_(severity), exit_on_fatal_(exit_on_fatal) {}
+
+MessageLogger::~MessageLogger() noexcept(false) {
+  if (severity_ == ::google::GLOG_FATAL) {
+    DealWithFatal();
+  }
+}
+
+std::stringstream& MessageLogger::stream() {
+  return stream_;
+}
+
+void MessageLogger::DealWithFatal() {
+  if (exit_on_fatal_) {
+    LOG(FATAL) << stream_.str();
+  } else {
+    throw c10::Error(stream_.str(), nullptr, nullptr);
+  }
+}
+
 } // namespace c10

 C10_DEFINE_int(
@ -412,17 +438,16 @@ void ShowLogInfoToStderr() {
  FLAGS_caffe2_log_level = GLOG_INFO;
 }

-MessageLogger::MessageLogger(const char* file, int line, int severity)
-    : severity_(severity) {
+MessageLogger::MessageLogger(
+    const char* file,
+    int line,
+    int severity,
+    bool exit_on_fatal)
+    : severity_(severity), exit_on_fatal_(exit_on_fatal) {
  if (severity_ < FLAGS_caffe2_log_level) {
    // Nothing needs to be logged.
    return;
  }
-#ifdef ANDROID
-  tag_ = "native";
-#else // !ANDROID
-  tag_ = "";
-#endif // ANDROID

  time_t rawtime = 0;
  time(&rawtime);
@ -458,7 +483,7 @@ MessageLogger::MessageLogger(const char* file, int line, int severity)
 }

 // Output the contents of the stream to the proper channel on destruction.
-MessageLogger::~MessageLogger() {
+MessageLogger::~MessageLogger() noexcept(false) {
  if (severity_ < FLAGS_caffe2_log_level) {
    // Nothing needs to be logged.
    return;
@ -498,6 +523,18 @@ MessageLogger::~MessageLogger() {
  }
 }

+std::stringstream& MessageLogger::stream() {
+  return stream_;
+}
+
+void MessageLogger::DealWithFatal() {
+  if (exit_on_fatal_) {
+    abort();
+  } else {
+    throw c10::Error(stream_.str(), nullptr, nullptr);
+  }
+}
+
 } // namespace c10

 #endif // !C10_USE_GLOG
--- a/c10/util/logging_common.h
+++ b/c10/util/logging_common.h
@ -0,0 +1,74 @@
+#ifndef C10_UTIL_LOGGING_COMMON_H_
+#define C10_UTIL_LOGGING_COMMON_H_
+
+#include <c10/macros/Export.h>
+#include <sstream>
+
+namespace c10 {
+
+// Stream-style logger declaration shared by the glog and non-glog builds. On a
+// fatal message it ends the process if exit_on_fatal, else throws c10::Error.
+class C10_API MessageLogger {
+ public:
+  MessageLogger(
+      const char* file,
+      int line,
+      int severity,
+      bool exit_on_fatal = true);
+  ~MessageLogger() noexcept(false); // noexcept(false): destructor may throw
+
+  // Returns the stream that buffers this logger's message.
+  std::stringstream& stream();
+
+ private:
+  // On a fatal log: abort when exit_on_fatal_ is true;
+  // otherwise throw c10::Error.
+  void DealWithFatal();
+
+#if defined(ANDROID) && !defined(C10_USE_GLOG)
+  const char* tag_{"native"};
+#endif
+  std::stringstream stream_;
+  int severity_;
+  bool exit_on_fatal_;
+};
+
+// This class is used to explicitly ignore values in the conditional
+// logging macros. This avoids compiler warnings like "value computed
+// is not used" and "statement has no effect".
+class C10_API LoggerVoidify {
+ public:
+  LoggerVoidify() = default;
+  // This has to be an operator with a precedence lower than << but
+  // higher than ?:
+  void operator&(const std::ostream& s [[maybe_unused]]) {}
+};
+
+// Forward declarations for CheckNotNull functions
+template <typename T>
+T& CheckNotNullCommon(
+    const char* file,
+    int line,
+    const char* names,
+    T& t,
+    bool fatal = true);
+
+template <typename T>
+T* CheckNotNull(
+    const char* file,
+    int line,
+    const char* names,
+    T* t,
+    bool fatal = true);
+
+template <typename T>
+T& CheckNotNull(
+    const char* file,
+    int line,
+    const char* names,
+    T& t,
+    bool fatal = true);
+
+} // namespace c10
+
+#endif // C10_UTIL_LOGGING_COMMON_H_
--- a/c10/util/logging_is_google_glog.h
+++ b/c10/util/logging_is_google_glog.h
@ -47,57 +47,53 @@ INSTANTIATE_FOR_CONTAINER(set)

 #endif

+#include <c10/util/logging_common.h>
 #include <glog/logging.h>

-// Additional macros on top of glog
-#define TORCH_CHECK_EQ(val1, val2) CHECK_EQ(val1, val2)
-#define TORCH_CHECK_NE(val1, val2) CHECK_NE(val1, val2)
-#define TORCH_CHECK_LE(val1, val2) CHECK_LE(val1, val2)
-#define TORCH_CHECK_LT(val1, val2) CHECK_LT(val1, val2)
-#define TORCH_CHECK_GE(val1, val2) CHECK_GE(val1, val2)
-#define TORCH_CHECK_GT(val1, val2) CHECK_GT(val1, val2)
+namespace c10 {

-#ifndef NDEBUG
-#define TORCH_DCHECK_EQ(val1, val2) DCHECK_EQ(val1, val2)
-#define TORCH_DCHECK_NE(val1, val2) DCHECK_NE(val1, val2)
-#define TORCH_DCHECK_LE(val1, val2) DCHECK_LE(val1, val2)
-#define TORCH_DCHECK_LT(val1, val2) DCHECK_LT(val1, val2)
-#define TORCH_DCHECK_GE(val1, val2) DCHECK_GE(val1, val2)
-#define TORCH_DCHECK_GT(val1, val2) DCHECK_GT(val1, val2)
-#else // !NDEBUG
-// These versions generate no code in optimized mode.
-#define TORCH_DCHECK_EQ(val1, val2) \
-  while (false)                     \
-  DCHECK_EQ(val1, val2)
-#define TORCH_DCHECK_NE(val1, val2) \
-  while (false)                     \
-  DCHECK_NE(val1, val2)
-#define TORCH_DCHECK_LE(val1, val2) \
-  while (false)                     \
-  DCHECK_LE(val1, val2)
-#define TORCH_DCHECK_LT(val1, val2) \
-  while (false)                     \
-  DCHECK_LT(val1, val2)
-#define TORCH_DCHECK_GE(val1, val2) \
-  while (false)                     \
-  DCHECK_GE(val1, val2)
-#define TORCH_DCHECK_GT(val1, val2) \
-  while (false)                     \
-  DCHECK_GT(val1, val2)
-#endif // NDEBUG
+[[noreturn]] void ThrowEnforceNotMet(
+    const char* file,
+    const int line,
+    const char* condition,
+    const std::string& msg,
+    const void* caller);

-// Check that a pointer is not null.
-#define TORCH_CHECK_NOTNULL(val) CHECK_NOTNULL(val)
+template <typename T>
+T& CheckNotNullCommon(
+    const char* file,
+    int line,
+    const char* names,
+    T& t,
+    bool fatal) {
+  if (t == nullptr) {
+    MessageLogger(file, line, ::google::GLOG_FATAL, fatal).stream()
+        << "Check failed: '" << names << "' must be non NULL. ";
+  }
+  return t;
+}

-#ifndef NDEBUG
-// Debug only version of TORCH_CHECK_NOTNULL
-#define TORCH_DCHECK_NOTNULL(val) DCHECK_NOTNULL(val)
-#else // !NDEBUG
-// Optimized version - generates no code.
-#define TORCH_DCHECK_NOTNULL(val) \
-  while (false)                   \
-  DCHECK_NOTNULL(val)
-#endif // NDEBUG
+template <typename T>
+T* CheckNotNull(
+    const char* file,
+    int line,
+    const char* names,
+    T* t,
+    bool fatal) {
+  return CheckNotNullCommon(file, line, names, t, fatal);
+}
+
+template <typename T>
+T& CheckNotNull(
+    const char* file,
+    int line,
+    const char* names,
+    T& t,
+    bool fatal) {
+  return CheckNotNullCommon(file, line, names, t, fatal);
+}
+
+} // namespace c10

 // Log with source location information override (to be used in generic
 // warning/error handlers implemented as functions, not macros)
--- a/c10/util/logging_is_not_google_glog.h
+++ b/c10/util/logging_is_not_google_glog.h
@ -13,6 +13,7 @@
 #include <vector>

 #include <c10/util/Flags.h>
+#include <c10/util/logging_common.h>

 const char CAFFE2_SEVERITY_PREFIX[] = "FEWIV";

@ -24,61 +25,40 @@ const int GLOG_ERROR = 2;
 const int GLOG_WARNING = 1;
 const int GLOG_INFO = 0;

-class C10_API MessageLogger {
- public:
-  MessageLogger(const char* file, int line, int severity);
-  ~MessageLogger();
-  // Return the stream associated with the logger object.
-  std::stringstream& stream() {
-    return stream_;
-  }
-
- private:
-  // When there is a fatal log, we simply abort.
-  void DealWithFatal() {
-    abort();
-  }
-
-  const char* tag_;
-  std::stringstream stream_;
-  int severity_;
-};
-
-// This class is used to explicitly ignore values in the conditional
-// logging macros.  This avoids compiler warnings like "value computed
-// is not used" and "statement has no effect".
-class C10_API LoggerVoidify {
- public:
-  LoggerVoidify() = default;
-  // This has to be an operator with a precedence lower than << but
-  // higher than ?:
-  void operator&(const std::ostream& s [[maybe_unused]]) {}
-};
-
-// Log a message and terminate.
-template <class T>
-void LogMessageFatal(const char* file, int line, const T& message) {
-  MessageLogger(file, line, GLOG_FATAL).stream() << message;
-}
-
 // Helpers for TORCH_CHECK_NOTNULL(). Two are necessary to support both raw
 // pointers and smart pointers.
 template <typename T>
-T& CheckNotNullCommon(const char* file, int line, const char* names, T& t) {
+T& CheckNotNullCommon(
+    const char* file,
+    int line,
+    const char* names,
+    T& t,
+    bool fatal) {
  if (t == nullptr) {
-    LogMessageFatal(file, line, std::string(names));
+    MessageLogger(file, line, GLOG_FATAL, fatal).stream()
+        << "Check failed: '" << names << "' must be non NULL. ";
  }
  return t;
 }

 template <typename T>
-T* CheckNotNull(const char* file, int line, const char* names, T* t) {
-  return CheckNotNullCommon(file, line, names, t);
+T* CheckNotNull(
+    const char* file,
+    int line,
+    const char* names,
+    T* t,
+    bool fatal) {
+  return CheckNotNullCommon(file, line, names, t, fatal);
 }

 template <typename T>
-T& CheckNotNull(const char* file, int line, const char* names, T& t) {
-  return CheckNotNullCommon(file, line, names, t);
+T& CheckNotNull(
+    const char* file,
+    int line,
+    const char* names,
+    T& t,
+    bool fatal) {
+  return CheckNotNullCommon(file, line, names, t, fatal);
 }
 } // namespace c10

@ -136,65 +116,6 @@ static_assert(
          ::c10::MessageLogger(__FILE__, __LINE__, ::c10::GLOG_##n).stream()
 #endif // NDEBUG

-#define TORCH_CHECK_OP(val1, val2, op)                                        \
-  FATAL_IF(((val1)op(val2))) << "Check failed: " #val1 " " #op " " #val2 " (" \
-                             << (val1) << " vs. " << (val2) << ") "
-
-// TORCH_CHECK_OP macro definitions
-#define TORCH_CHECK_EQ(val1, val2) TORCH_CHECK_OP(val1, val2, ==)
-#define TORCH_CHECK_NE(val1, val2) TORCH_CHECK_OP(val1, val2, !=)
-#define TORCH_CHECK_LE(val1, val2) TORCH_CHECK_OP(val1, val2, <=)
-#define TORCH_CHECK_LT(val1, val2) TORCH_CHECK_OP(val1, val2, <)
-#define TORCH_CHECK_GE(val1, val2) TORCH_CHECK_OP(val1, val2, >=)
-#define TORCH_CHECK_GT(val1, val2) TORCH_CHECK_OP(val1, val2, >)
-
-#ifndef NDEBUG
-// Debug only versions of TORCH_CHECK_OP macros.
-#define TORCH_DCHECK_EQ(val1, val2) TORCH_CHECK_OP(val1, val2, ==)
-#define TORCH_DCHECK_NE(val1, val2) TORCH_CHECK_OP(val1, val2, !=)
-#define TORCH_DCHECK_LE(val1, val2) TORCH_CHECK_OP(val1, val2, <=)
-#define TORCH_DCHECK_LT(val1, val2) TORCH_CHECK_OP(val1, val2, <)
-#define TORCH_DCHECK_GE(val1, val2) TORCH_CHECK_OP(val1, val2, >=)
-#define TORCH_DCHECK_GT(val1, val2) TORCH_CHECK_OP(val1, val2, >)
-#else // !NDEBUG
-// These versions generate no code in optimized mode.
-#define TORCH_DCHECK_EQ(val1, val2) \
-  while (false)                     \
-  TORCH_CHECK_OP(val1, val2, ==)
-#define TORCH_DCHECK_NE(val1, val2) \
-  while (false)                     \
-  TORCH_CHECK_OP(val1, val2, !=)
-#define TORCH_DCHECK_LE(val1, val2) \
-  while (false)                     \
-  TORCH_CHECK_OP(val1, val2, <=)
-#define TORCH_DCHECK_LT(val1, val2) \
-  while (false)                     \
-  TORCH_CHECK_OP(val1, val2, <)
-#define TORCH_DCHECK_GE(val1, val2) \
-  while (false)                     \
-  TORCH_CHECK_OP(val1, val2, >=)
-#define TORCH_DCHECK_GT(val1, val2) \
-  while (false)                     \
-  TORCH_CHECK_OP(val1, val2, >)
-#endif // NDEBUG
-
-// Check that a pointer is not null.
-#define TORCH_CHECK_NOTNULL(val) \
-  ::c10::CheckNotNull(           \
-      __FILE__, __LINE__, "Check failed: '" #val "' Must be non NULL", (val))
-
-#ifndef NDEBUG
-// Debug only version of TORCH_CHECK_NOTNULL
-#define TORCH_DCHECK_NOTNULL(val) \
-  ::c10::CheckNotNull(            \
-      __FILE__, __LINE__, "Check failed: '" #val "' Must be non NULL", (val))
-#else // !NDEBUG
-// Optimized version - generates no code.
-#define TORCH_DCHECK_NOTNULL(val) \
-  while (false)                   \
-  TORCH_CHECK_NOTNULL(val)
-#endif // NDEBUG
-
 // ---------------------- Support for std objects --------------------------
 // These are adapted from glog to support a limited set of logging capability
 // for STL objects.
--- a/caffe2/CMakeLists.txt
+++ b/caffe2/CMakeLists.txt
@ -1941,6 +1941,7 @@ if(BUILD_TEST)
    foreach(test_src ${Caffe2_XPU_TEST_SRCS})
      get_filename_component(test_name ${test_src} NAME_WE)
      add_executable(${test_name} "${test_src}")
+      torch_compile_options(${test_name})
      target_link_libraries(${test_name} torch_library gtest_main)
      target_include_directories(${test_name} PRIVATE $<INSTALL_INTERFACE:include>)
      target_include_directories(${test_name} PRIVATE ${Caffe2_CPU_INCLUDE})
--- a/caffe2/perfkernels/batch_box_cox_vec.h
+++ b/caffe2/perfkernels/batch_box_cox_vec.h
@ -73,6 +73,19 @@ void box_cox_zero_lambda(
  }
 }

+template <typename T>
+at::vec::Vectorized<T> box_cox_nonzero_lambda_impl(
+    at::vec::Vectorized<T> data,
+    at::vec::Vectorized<T> lambda1,
+    at::vec::Vectorized<T> lambda2,
+    at::vec::Vectorized<T> k_eps) {
+  auto sum = data + lambda2;
+  auto max = at::vec::max(sum, k_eps);
+  auto lambda_over_1 = at::vec::fast_recieprocal(lambda1);
+  auto pow = max.pow(lambda1);
+  return at::vec::fmsub(pow, lambda_over_1, lambda_over_1);
+}
+
 template <typename T>
 void box_cox_nonzero_lambda(
    int64_t D,
@ -88,21 +101,18 @@ void box_cox_nonzero_lambda(
  auto k_eps_vec = Vec(k_eps);
  for(; j + VLEN < D; j += VLEN) {
    auto data = Vec::loadu(data_ptr + j);
-    auto lambda2 = Vec::loadu(lambda2_ptr + j);
-    auto sum = data + lambda2;
-    auto max = at::vec::max(sum, k_eps_vec);
    auto lambda1 = Vec::loadu(lambda1_ptr + j);
-    auto lambda_over_1 = at::vec::fast_recieprocal(lambda1);
-    auto pow = max.pow(lambda1);
-    auto res = at::vec::fmsub(pow, lambda_over_1, lambda_over_1);
+    auto lambda2 = Vec::loadu(lambda2_ptr + j);
+    auto res = box_cox_nonzero_lambda_impl(data, lambda1, lambda2, k_eps_vec);
    res.store(out + j);
  }
-  for ( ;j < D; ++j) {
-    auto sum = data_ptr[j] + lambda2_ptr[j];
-    auto max = std::max(sum, k_eps);
-    auto lambda_over_1 = at::vec::fast_recieprocal(lambda1_ptr[j]);
-    auto pow = std::pow(max, lambda1_ptr[j]);
-    out[j] = pow * lambda_over_1 - lambda_over_1;
+  if (j < D) {
+    auto remaining = D - j;
+    auto data = Vec::loadu(data_ptr + j, remaining);
+    auto lambda1 = Vec::loadu(lambda1_ptr + j, remaining);
+    auto lambda2 = Vec::loadu(lambda2_ptr + j, remaining);
+    auto res = box_cox_nonzero_lambda_impl(data, lambda1, lambda2, k_eps_vec);
+    res.store(out + j, remaining);
  }
 }
 #else
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@ -206,6 +206,41 @@ templates_path = [
    os.path.join(os.path.dirname(pytorch_sphinx_theme2.__file__), "templates"),
 ]
 # TODO: document these and remove them from here.
+# Fixes duplicated autosummary output filenames for entries whose names differ only by case
+autosummary_filename_map = {
+    "torch.nn.utils.prune.identity": "torch.nn.utils.prune.identity_function",
+    "torch.nn.utils.prune.Identity": "torch.nn.utils.prune.Identity_class",
+    "torch.optim.adamw.adamw": "torch.optim.adamw.adamw_function",
+    "torch.optim.adamw.AdamW": "torch.optim.adamw.AdamW_class",
+    "torch.optim.asgd.asgd": "torch.optim.asgd.asgd_function",
+    "torch.optim.asgd.ASGD": "torch.optim.asgd.ASGD_class",
+    "torch.optim.nadam.nadam": "torch.optim.nadam.nadam_function",
+    "torch.optim.nadam.NAdam": "torch.optim.nadam.NAdam_class",
+    "torch.optim.radam.radam": "torch.optim.radam.radam_function",
+    "torch.optim.radam.RAdam": "torch.optim.radam.RAdam_class",
+    "torch.optim.rmsprop.rmsprop": "torch.optim.rmsprop.rmsprop_function",
+    "torch.optim.rmsprop.RMSprop": "torch.optim.rmsprop.RMSprop_class",
+    "torch.optim.rprop.rprop": "torch.optim.rprop.rprop_function",
+    "torch.optim.rprop.Rprop": "torch.optim.rprop.Rprop_class",
+    "torch.optim.sgd.sgd": "torch.optim.sgd.sgd_function",
+    "torch.optim.sgd.SGD": "torch.optim.sgd.SGD_class",
+    "torch.optim.adadelta.adadelta": "torch.optim.adadelta.adadelta_function",
+    "torch.optim.adadelta.Adadelta": "torch.optim.adadelta.Adadelta_class",
+    "torch.optim.adagrad.adagrad": "torch.optim.adagrad.adagrad_function",
+    "torch.optim.adagrad.Adagrad": "torch.optim.adagrad.Adagrad_class",
+    "torch.optim.adam.adam": "torch.optim.adam.adam_function",
+    "torch.optim.adam.Adam": "torch.optim.adam.Adam_class",
+    "torch.optim.adamax.adamax": "torch.optim.adamax.adamax_function",
+    "torch.optim.adamax.Adamax": "torch.optim.adamax.Adamax_class",
+    "torch.mtia.stream": "torch.mtia.stream_function",
+    "torch.mtia.Stream": "torch.mtia.Stream_class",
+    "torch.cpu.stream": "torch.cpu.stream_function",
+    "torch.cpu.Stream": "torch.cpu.Stream_class",
+    "torch.cuda.stream": "torch.cuda.stream_function",
+    "torch.cuda.Stream": "torch.cuda.Stream_class",
+    "torch.xpu.stream": "torch.xpu.stream_function",
+    "torch.xpu.Stream": "torch.xpu.Stream_class",
+}

 coverage_ignore_functions = [
    # torch
@ -347,20 +382,6 @@ coverage_ignore_functions = [
    # torch.ao.quantization.backend_config.tensorrt
    "get_tensorrt_backend_config",
    "get_tensorrt_backend_config_dict",
-    # torch.ao.quantization.backend_config.utils
-    "entry_to_pretty_str",
-    "get_fused_module_classes",
-    "get_fuser_method_mapping",
-    "get_fusion_pattern_to_extra_inputs_getter",
-    "get_fusion_pattern_to_root_node_getter",
-    "get_module_to_qat_module",
-    "get_pattern_to_dtype_configs",
-    "get_pattern_to_input_type_to_index",
-    "get_qat_module_classes",
-    "get_root_module_to_quantized_reference_module",
-    "pattern_to_human_readable",
-    "remove_boolean_dispatch_from_name",
-    # torch.ao.quantization.backend_config.x86
    "get_x86_backend_config",
    # torch.ao.quantization.fuse_modules
    "fuse_known_modules",
@ -391,25 +412,6 @@ coverage_ignore_functions = [
    "insert_observers_for_model",
    "prepare",
    "propagate_dtypes_for_known_nodes",
-    # torch.ao.quantization.fx.utils
-    "all_node_args_except_first",
-    "all_node_args_have_no_tensors",
-    "assert_and_get_unique_device",
-    "collect_producer_nodes",
-    "create_getattr_from_value",
-    "create_node_from_old_node_preserve_meta",
-    "get_custom_module_class_keys",
-    "get_linear_prepack_op_for_dtype",
-    "get_new_attr_name_with_prefix",
-    "get_non_observable_arg_indexes_and_types",
-    "get_qconv_prepack_op",
-    "get_skipped_module_name_and_classes",
-    "graph_module_from_producer_nodes",
-    "maybe_get_next_module",
-    "node_arg_is_bias",
-    "node_arg_is_weight",
-    "return_arg_list",
-    # torch.ao.quantization.pt2e.graph_utils
    "bfs_trace_with_node_process",
    "find_sequential_partitions",
    "get_equivalent_types",
@ -825,80 +827,10 @@ coverage_ignore_functions = [
    "get_latency_of_one_partition",
    "get_latency_of_partitioned_graph",
    "get_partition_to_latency_mapping",
-    # torch.fx.experimental.proxy_tensor
-    "decompose",
-    "disable_autocast_cache",
-    "disable_proxy_modes_tracing",
-    "dispatch_trace",
-    "extract_val",
-    "fake_signature",
-    "fetch_sym_proxy",
-    "fetch_object_proxy",
-    "get_innermost_proxy_mode",
-    "get_isolated_graphmodule",
-    "get_proxy_slot",
-    "get_torch_dispatch_modes",
-    "has_proxy_slot",
-    "is_sym_node",
-    "maybe_handle_decomp",
-    "proxy_call",
-    "set_meta",
-    "set_original_aten_op",
-    "set_proxy_slot",
-    "snapshot_fake",
-    "thunkify",
-    "track_tensor",
-    "track_tensor_tree",
-    "wrap_key",
-    "wrapper_and_args_for_make_fx",
-    # torch.fx.experimental.recording
    "record_shapeenv_event",
    "replay_shape_env_events",
    "shape_env_check_state_equal",
-    # torch.fx.experimental.sym_node
-    "ceil_impl",
-    "floor_ceil_helper",
-    "floor_impl",
-    "method_to_operator",
-    "sympy_is_channels_last_contiguous_2d",
-    "sympy_is_channels_last_contiguous_3d",
-    "sympy_is_channels_last_strides_2d",
-    "sympy_is_channels_last_strides_3d",
-    "sympy_is_channels_last_strides_generic",
-    "sympy_is_contiguous",
-    "sympy_is_contiguous_generic",
-    "to_node",
-    "wrap_node",
    "sym_sqrt",
-    # torch.fx.experimental.symbolic_shapes
-    "bind_symbols",
-    "cast_symbool_to_symint_guardless",
-    "create_contiguous",
-    "error",
-    "eval_guards",
-    "eval_is_non_overlapping_and_dense",
-    "expect_true",
-    "find_symbol_binding_fx_nodes",
-    "free_symbols",
-    "free_unbacked_symbols",
-    "fx_placeholder_targets",
-    "fx_placeholder_vals",
-    "guard_bool",
-    "guard_float",
-    "guard_int",
-    "guard_scalar",
-    "has_hint",
-    "has_symbolic_sizes_strides",
-    "is_channels_last_contiguous_2d",
-    "is_channels_last_contiguous_3d",
-    "is_channels_last_strides_2d",
-    "is_channels_last_strides_3d",
-    "is_contiguous",
-    "is_non_overlapping_and_dense_indicator",
-    "is_nested_int",
-    "is_symbol_binding_fx_node",
-    "is_symbolic",
-    # torch.fx.experimental.unification.core
    "reify",
    # torch.fx.experimental.unification.match
    "edge",
@ -936,24 +868,6 @@ coverage_ignore_functions = [
    "reverse_dict",
    # torch.fx.experimental.unification.multipledispatch.variadic
    "isvariadic",
-    # torch.fx.experimental.unification.unification_tools
-    "assoc",
-    "assoc_in",
-    "dissoc",
-    "first",
-    "get_in",
-    "getter",
-    "groupby",
-    "itemfilter",
-    "itemmap",
-    "keyfilter",
-    "keymap",
-    "merge",
-    "merge_with",
-    "update_in",
-    "valfilter",
-    "valmap",
-    # torch.fx.experimental.unification.utils
    "freeze",
    "hashable",
    "raises",
@ -3195,6 +3109,11 @@ autodoc_type_aliases = {
 # Enable overriding of function signatures in the first line of the docstring.
 autodoc_docstring_signature = True

+# Exclude inherited IntEnum methods that have RST formatting issues in their docstrings
+autodoc_default_options = {
+    "exclude-members": "from_bytes, to_bytes",
+}
+
 # -- katex javascript in header
 #
 #    def setup(app):
--- a/docs/source/fx.experimental.md
+++ b/docs/source/fx.experimental.md
@ -12,6 +12,37 @@ These APIs are experimental and subject to change without notice.
 .. autoclass:: torch.fx.experimental.sym_node.DynamicInt
 ```

+## torch.fx.experimental.sym_node
+
+```{eval-rst}
+.. currentmodule:: torch.fx.experimental.sym_node
+```
+
+```{eval-rst}
+.. automodule:: torch.fx.experimental.sym_node
+```
+
+```{eval-rst}
+.. autosummary::
+    :toctree: generated
+    :nosignatures:
+
+    is_channels_last_contiguous_2d
+    is_channels_last_contiguous_3d
+    is_channels_last_strides_2d
+    is_channels_last_strides_3d
+    is_contiguous
+    is_non_overlapping_and_dense_indicator
+    method_to_operator
+    sympy_is_channels_last_contiguous_2d
+    sympy_is_channels_last_contiguous_3d
+    sympy_is_channels_last_strides_2d
+    sympy_is_channels_last_strides_3d
+    sympy_is_channels_last_strides_generic
+    sympy_is_contiguous
+    sympy_is_contiguous_generic
+```
+
 ## torch.fx.experimental.symbolic_shapes

 ```{eval-rst}
@ -69,6 +100,25 @@ These APIs are experimental and subject to change without notice.
    rebind_unbacked
    resolve_unbacked_bindings
    is_accessor_node
+    cast_symbool_to_symint_guardless
+    create_contiguous
+    error
+    eval_guards
+    eval_is_non_overlapping_and_dense
+    find_symbol_binding_fx_nodes
+    free_symbols
+    free_unbacked_symbols
+    fx_placeholder_targets
+    fx_placeholder_vals
+    guard_bool
+    guard_float
+    guard_int
+    guard_scalar
+    has_hint
+    has_symbolic_sizes_strides
+    is_nested_int
+    is_symbol_binding_fx_node
+    is_symbolic
 ```

 ## torch.fx.experimental.proxy_tensor
@ -91,4 +141,46 @@ These APIs are experimental and subject to change without notice.
    get_proxy_mode
    maybe_enable_thunkify
    maybe_disable_thunkify
+    decompose
+    disable_autocast_cache
+    disable_proxy_modes_tracing
+    extract_val
+    fake_signature
+    fetch_object_proxy
+    fetch_sym_proxy
+    has_proxy_slot
+    is_sym_node
+    maybe_handle_decomp
+    proxy_call
+    set_meta
+    set_original_aten_op
+    set_proxy_slot
+    snapshot_fake
 ```
+
+## torch.fx.experimental.unification.unification_tools
+
+```{eval-rst}
+.. currentmodule:: torch.fx.experimental.unification.unification_tools
+```
+
+```{eval-rst}
+.. automodule:: torch.fx.experimental.unification.unification_tools
+```
+
+```{eval-rst}
+.. autosummary::
+    :toctree: generated
+    :nosignatures:
+
+    assoc
+    assoc_in
+    dissoc
+    first
+    keyfilter
+    keymap
+    merge
+    merge_with
+    update_in
+    valfilter
+    valmap
--- a/docs/source/fx.md
+++ b/docs/source/fx.md
@ -1134,7 +1134,6 @@ The set of leaf modules can be customized by overriding
 .. py:module:: torch.fx.experimental.refinement_types
 .. py:module:: torch.fx.experimental.rewriter
 .. py:module:: torch.fx.experimental.schema_type_annotation
-.. py:module:: torch.fx.experimental.sym_node
 .. py:module:: torch.fx.experimental.unification.core
 .. py:module:: torch.fx.experimental.unification.dispatch
 .. py:module:: torch.fx.experimental.unification.match
@ -1144,7 +1143,6 @@ The set of leaf modules can be customized by overriding
 .. py:module:: torch.fx.experimental.unification.multipledispatch.dispatcher
 .. py:module:: torch.fx.experimental.unification.multipledispatch.utils
 .. py:module:: torch.fx.experimental.unification.multipledispatch.variadic
-.. py:module:: torch.fx.experimental.unification.unification_tools
 .. py:module:: torch.fx.experimental.unification.utils
 .. py:module:: torch.fx.experimental.unification.variable
 .. py:module:: torch.fx.experimental.unify_refinements
--- a/docs/source/notes/cuda.rst
+++ b/docs/source/notes/cuda.rst
@ -619,6 +619,10 @@ Available options:
  and reallocate buffers across multiple streams, especially when the capture DAG frequently
  reaches joined frontiers.

+* ``per_process_memory_fraction`` option limits the amount of memory that can be allocated
+  on all the CUDA devices to a specified fraction of the available memory. This is a value
+  between 0 and 1. Attempting to allocate more memory will raise an out of memory error.
+
 .. note::

    Some stats reported by the
--- a/docs/source/notes/libtorch_stable_abi.md
+++ b/docs/source/notes/libtorch_stable_abi.md
@ -46,6 +46,108 @@ These headers are promised to be ABI stable across releases and adhere to a stro
 Unless absolutely necessary, we recommend the high-level C++ API in `torch/csrc/stable`
 which will handle all the rough edges of the C API for the user.

+## Migrating your kernel to the LibTorch stable ABI
+
+If you'd like your kernel to be ABI stable with LibTorch, meaning you'd have the ability to build for one version and run on another, your kernel must only use the limited stable ABI. The following section goes through some steps of migrating an existing kernel and the APIs we imagine you would need to swap over.
+
+Firstly, instead of registering kernels through `TORCH_LIBRARY`, LibTorch ABI stable kernels must be registered via `STABLE_TORCH_LIBRARY`. Note that, for the time being, implementations registered via `STABLE_TORCH_LIBRARY` must be boxed unlike `TORCH_LIBRARY`. See the simple example below or our docs on [Stack-based APIs](stack-based-apis) for more details. For kernels that are registered via `pybind`, before using the stable ABI, it would be useful to migrate to register them via `TORCH_LIBRARY`.
+
+While previously your kernels might have included APIs from `<torch/*.h>` (for example, `<torch/all.h>`), they are now limited to including from the 3 categories of headers mentioned above (`torch/csrc/stable/*.h`, `torch/headeronly/*.h` and the stable C headers). This means that your extension should no longer use any utilities from the `at::` or `c10::` namespaces but instead use their replacements in `torch::stable` and `torch::headeronly`. To provide a couple examples of the necessary migrations:
+- all uses of `at::Tensor` must be replaced with `torch::stable::Tensor`
+- all uses of `TORCH_CHECK` must be replaced with `STD_TORCH_CHECK`
+- all uses of `at::kCUDA` must be replaced with `torch::headeronly::kCUDA` etc.
+- native functions such as `at::pad` must be replaced with `torch::stable::pad`
+- native functions that are called as Tensor methods (e.g., `Tensor.pad`) must be replaced with the ATen variant through `torch::stable::pad`.
+
+As mentioned above, the LibTorch stable ABI is still under development. If there is any API or feature you would like to see added to the stable ABI/`torch::headeronly`/`torch::stable`, please file a request through a [new issue on the PyTorch repo](https://github.com/pytorch/pytorch/issues).
+
+Below is a simple example of migrating an existing kernel that uses `TORCH_LIBRARY` to the stable ABI (`STABLE_TORCH_LIBRARY`). For a larger end-to-end example, you can take a look at the FA3 repository. Specifically, see the diff between [`flash_api.cpp`](https://github.com/Dao-AILab/flash-attention/blob/ad70a007e6287d4f7e766f94bcf2f9a813f20f6b/hopper/flash_api.cpp#L1) and the stable variant [`flash_api_stable.cpp`](https://github.com/Dao-AILab/flash-attention/blob/ad70a007e6287d4f7e766f94bcf2f9a813f20f6b/hopper/flash_api_stable.cpp#L1).
+
+
+### Original Version with `TORCH_LIBRARY`
+
+```cpp
+// original_kernel.cpp - Using TORCH_LIBRARY (not stable ABI)
+#include <torch/torch.h>
+#include <ATen/ATen.h>
+
+namespace myops {
+
+// Simple kernel that adds a scalar value to each element of a tensor
+at::Tensor add_scalar(const at::Tensor& input, double scalar) {
+  TORCH_CHECK(input.scalar_type() == at::kFloat, "Input must be float32");
+
+  return input.add(scalar);
+}
+
+// Register the operator
+TORCH_LIBRARY(myops, m) {
+  m.def("add_scalar(Tensor input, float scalar) -> Tensor", &add_scalar);
+}
+
+// Register the implementation
+TORCH_LIBRARY_IMPL(myops, CompositeExplicitAutograd, m) {
+  m.impl("add_scalar", &add_scalar);
+}
+
+} // namespace myops
+```
+
+### Migrated Version with `STABLE_TORCH_LIBRARY`
+
+```cpp
+// stable_kernel.cpp - Using STABLE_TORCH_LIBRARY (stable ABI)
+
+// (1) Don't include <torch/torch.h> <ATen/ATen.h>
+//     only include APIs from torch/csrc/stable, torch/headeronly and C-shims
+#include <torch/csrc/stable/library.h>
+#include <torch/csrc/stable/tensor_struct.h>
+#include <torch/csrc/stable/ops.h>
+#include <torch/csrc/stable/stableivalue_conversions.h>
+#include <torch/headeronly/core/ScalarType.h>
+#include <torch/headeronly/macros/Macros.h>
+
+namespace myops {
+
+// Simple kernel that adds a scalar value to each element of a tensor
+torch::stable::Tensor add_scalar(const torch::stable::Tensor& input, double scalar) {
+  // (2) use STD_TORCH_CHECK instead of TORCH_CHECK
+  STD_TORCH_CHECK(
+      // (3) use torch::headeronly::kFloat instead of at::kFloat
+      input.scalar_type() == torch::headeronly::kFloat,
+      "Input must be float32");
+
+  // (4) Use stable ops namespace instead of input.add
+  return torch::stable::add(input, scalar);
+}
+
+// (5) Add Boxed wrapper required for STABLE_TORCH_LIBRARY
+void boxed_add_scalar(StableIValue* stack, uint64_t num_args, uint64_t num_outputs) {
+  // Extract arguments from stack using `to<T>`
+  auto input = to<torch::stable::Tensor>(stack[0]);
+  auto scalar = to<double>(stack[1]);
+
+  // Call the actual kernel
+  auto result = add_scalar(input, scalar);
+
+  // Put result back on stack using `from()`
+  // Stack slot 0 now holds the return value
+  stack[0] = from(result);
+}
+
+// (6) Register the operator using STABLE_TORCH_LIBRARY
+STABLE_TORCH_LIBRARY(myops, m) {
+  m.def("add_scalar(Tensor input, float scalar) -> Tensor", &boxed_add_scalar);
+}
+
+// (7) Register the implementation using STABLE_TORCH_LIBRARY_IMPL
+STABLE_TORCH_LIBRARY_IMPL(myops, CompositeExplicitAutograd, m) {
+  m.impl("add_scalar", &boxed_add_scalar);
+}
+
+} // namespace myops
+```
+

 ## How are objects passed across the ABI boundary when interacting with the dispatcher?

@ -109,6 +211,7 @@ There are two invariants for the stack:
    a. When calling a stack-based API, you must give owning references to the calling stack and steal references from the returned stack.
    b. When registering your function to be called with a stack, you must steal references from your argument stack and push onto the stack new references.

+(stack-based-apis)=
 ### Stack-based APIs

 The above is relevant in two places:
--- a/docs/source/quantization-support.md
+++ b/docs/source/quantization-support.md
@ -134,6 +134,23 @@ Quantization to work with this as well.
    ObservationType
 ```

+## torch.ao.quantization.backend_config.utils
+```{eval-rst}
+.. currentmodule:: torch.ao.quantization.backend_config.utils
+```
+
+```{eval-rst}
+.. autosummary::
+    :toctree: generated
+    :nosignatures:
+    :template: classtemplate.rst
+
+    entry_to_pretty_str
+    pattern_to_human_readable
+    remove_boolean_dispatch_from_name
+
+```
+
 ## torch.ao.quantization.fx.custom_config

 This module contains a few CustomConfig classes that's used in both eager mode and FX graph mode quantization
@ -154,6 +171,30 @@ This module contains a few CustomConfig classes that's used in both eager mode a
    StandaloneModuleConfigEntry
 ```

+## torch.ao.quantization.fx.utils
+
+```{eval-rst}
+.. currentmodule:: torch.ao.quantization.fx.utils
+```
+
+```{eval-rst}
+.. autosummary::
+    :toctree: generated
+    :nosignatures:
+    :template: classtemplate.rst
+
+    all_node_args_except_first
+    all_node_args_have_no_tensors
+    collect_producer_nodes
+    create_getattr_from_value
+    create_node_from_old_node_preserve_meta
+    graph_module_from_producer_nodes
+    maybe_get_next_module
+    node_arg_is_bias
+    node_arg_is_weight
+    return_arg_list
+```
+
 ## torch.ao.quantization.quantizer

 ```{eval-rst}
@ -253,7 +294,6 @@ regular full-precision tensor.
 .. autosummary::
    :toctree: generated
    :nosignatures:
-    :template: classtemplate.rst

    view
    as_strided
--- a/test/ao/sparsity/test_scheduler.py
+++ b/test/ao/sparsity/test_scheduler.py
@ -75,6 +75,7 @@ class TestScheduler(TestCase):

 class TestCubicScheduler(TestCase):
    def setUp(self):
+        super().setUp()
        self.model_sparse_config = [
            {"tensor_fqn": "0.weight", "sparsity_level": 0.8},
            {"tensor_fqn": "2.weight", "sparsity_level": 0.4},
--- a/test/backends/xeon/test_launch.py
+++ b/test/backends/xeon/test_launch.py
@ -11,6 +11,7 @@ from torch.testing._internal.common_utils import IS_LINUX, run_tests, TestCase
@unittest.skipIf(not IS_LINUX, "Only works on linux")
 class TestTorchrun(TestCase):
    def setUp(self):
+        super().setUp()
        self._test_dir = tempfile.mkdtemp(prefix=self.__class__.__name__)

    def tearDown(self):
--- a/test/cpp/aoti_abi_check/CMakeLists.txt
+++ b/test/cpp/aoti_abi_check/CMakeLists.txt
@ -10,6 +10,8 @@ set(AOTI_ABI_CHECK_TEST_SRCS
  ${AOTI_ABI_CHECK_TEST_ROOT}/main.cpp
  ${AOTI_ABI_CHECK_TEST_ROOT}/test_cast.cpp
  ${AOTI_ABI_CHECK_TEST_ROOT}/test_devicetype.cpp
+  ${AOTI_ABI_CHECK_TEST_ROOT}/test_dispatch.cpp
+  ${AOTI_ABI_CHECK_TEST_ROOT}/test_dispatch_v2.cpp
  ${AOTI_ABI_CHECK_TEST_ROOT}/test_dtype.cpp
  ${AOTI_ABI_CHECK_TEST_ROOT}/test_exception.cpp
  ${AOTI_ABI_CHECK_TEST_ROOT}/test_headeronlyarrayref.cpp
--- a/test/cpp/aoti_abi_check/test_dispatch.cpp
+++ b/test/cpp/aoti_abi_check/test_dispatch.cpp
@ -0,0 +1,82 @@
+#include <gtest/gtest.h>
+
+#include <torch/headeronly/core/Dispatch.h>
+#include <torch/headeronly/core/Dispatch_v2.h>
+
+// MY_PRIVATE_CHECK_SELECTIVE_BUILD is a prelude to case block. For
+// testing, we do nothing:
+#define MY_PRIVATE_CHECK_SELECTIVE_BUILD(enum_type) /* empty */
+
+#define MY_PRIVATE_CASE_TYPE_USING_HINT(...) \
+  THO_PRIVATE_CASE_TYPE_USING_HINT_TMPL(     \
+      MY_PRIVATE_CHECK_SELECTIVE_BUILD, __VA_ARGS__)
+
+#define MY_DISPATCH_CASE(...) \
+  THO_DISPATCH_CASE_TMPL(MY_PRIVATE_CASE_TYPE_USING_HINT, __VA_ARGS__)
+
+// MY_RECORD_KERNEL_FUNCTION_DTYPE is a prelude to switch
+// statement. For testing, we just avoid unused variable warning:
+#define MY_RECORD_KERNEL_FUNCTION_DTYPE(DISPATCHNAME, ENUMTYPE) \
+  (void)DISPATCHNAME
+
+// MY_CHECK_NOT_IMPLEMENTED is called in switch default block. For
+// testing, we count case mismatches:
+#define MY_CHECK_NOT_IMPLEMENTED(...) default_count++
+
+#define MY_DISPATCH_SWITCH(...) \
+  THO_DISPATCH_SWITCH_TMPL(     \
+      MY_RECORD_KERNEL_FUNCTION_DTYPE, MY_CHECK_NOT_IMPLEMENTED, __VA_ARGS__)
+
+// MY_CASE_FUNCTION is called in a case block. For testing, we count
+// case matches and ensure that scalar_t/index_t type is defined:
+#define MY_CASE_FUNCTION \
+  [&] {                  \
+    count++;             \
+    scalar_t tmp;        \
+    (void)tmp;           \
+  }
+#define MY_INDEX_CASE_FUNCTION \
+  [&] {                        \
+    count++;                   \
+    index_t tmp;               \
+    (void)tmp;                 \
+  }
+
+#define DEFINE_ITEM(TYPE, SCALARTYPE) ScalarType::SCALARTYPE,
+
+#define MY_DISPATCH_V2(TYPE, NAME, BODY, ...) \
+  THO_DISPATCH_V2_TMPL(                       \
+      MY_DISPATCH_SWITCH,                     \
+      MY_DISPATCH_CASE,                       \
+      TYPE,                                   \
+      NAME,                                   \
+      AT_WRAP(BODY),                          \
+      __VA_ARGS__)
+
+#define TEST_DISPATCH_V2(NAME, EXPECTEDCOUNT, ...)                             \
+  TEST(TestDispatchV2, NAME) {                                                 \
+    using torch::headeronly::ScalarType;                                       \
+    using torch::headeronly::impl::ScalarTypeToCPPTypeT;                       \
+    int8_t total_count = 0;                                                    \
+    int8_t count = 0;                                                          \
+    int8_t default_count = 0;                                                  \
+    for (ScalarType t :                                                        \
+         {AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS(DEFINE_ITEM)}) {       \
+      total_count++;                                                           \
+      MY_DISPATCH_V2(t, "test_my_dispatch_v2", MY_CASE_FUNCTION, __VA_ARGS__); \
+    }                                                                          \
+    EXPECT_EQ(count, EXPECTEDCOUNT);                                           \
+    EXPECT_EQ(default_count + count, total_count);                             \
+  }
+
+TEST_DISPATCH_V2(AT_FLOAT8_TYPES_, 5, AT_FLOAT8_TYPES);
+TEST_DISPATCH_V2(AT_INTEGRAL_TYPES_, 5, AT_INTEGRAL_TYPES);
+TEST_DISPATCH_V2(AT_FLOATING_TYPES_, 2, AT_FLOATING_TYPES);
+TEST_DISPATCH_V2(AT_BAREBONES_UNSIGNED_TYPES_, 3, AT_BAREBONES_UNSIGNED_TYPES);
+TEST_DISPATCH_V2(AT_INTEGRAL_TYPES_V2_, 8, AT_INTEGRAL_TYPES_V2);
+TEST_DISPATCH_V2(AT_COMPLEX_TYPES_, 2, AT_COMPLEX_TYPES);
+TEST_DISPATCH_V2(AT_QINT_TYPES_, 3, AT_QINT_TYPES);
+TEST_DISPATCH_V2(AT_ALL_TYPES_, 7, AT_ALL_TYPES);
+TEST_DISPATCH_V2(AT_ALL_TYPES_AND_COMPLEX_, 9, AT_ALL_TYPES_AND_COMPLEX);
+
+#undef DEFINE_ITEM
--- a/test/cpp/aoti_abi_check/test_dispatch_v2.cpp
+++ b/test/cpp/aoti_abi_check/test_dispatch_v2.cpp
@ -0,0 +1,45 @@
+#include <gtest/gtest.h>
+#include <torch/headeronly/core/Dispatch_v2.h>
+#include <torch/headeronly/util/Exception.h>
+
+#define DEFINE_ITEM(TYPE, SCALARTYPE) ScalarType::SCALARTYPE,
+
+#define TEST_DISPATCH_V2(NAME, EXPECTEDCOUNT, ...)                       \
+  TEST(TestThoDispatchV2, NAME) {                                        \
+    using torch::headeronly::ScalarType;                                 \
+    using torch::headeronly::impl::ScalarTypeToCPPTypeT;                 \
+    int8_t total_count = 0;                                              \
+    int8_t count = 0;                                                    \
+    int8_t default_count = 0;                                            \
+    for (ScalarType t :                                                  \
+         {AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_AND_QINTS(DEFINE_ITEM)}) { \
+      total_count++;                                                     \
+      try {                                                              \
+        THO_DISPATCH_V2(                                                 \
+            t,                                                           \
+            "test_tho_dispatch_v2",                                      \
+            [&] {                                                        \
+              count++;                                                   \
+              scalar_t tmp;                                              \
+              (void)tmp;                                                 \
+            },                                                           \
+            __VA_ARGS__);                                                \
+      } catch (...) {                                                    \
+        default_count++; /* counts mismatches */                         \
+      }                                                                  \
+    }                                                                    \
+    EXPECT_EQ(count, EXPECTEDCOUNT);                                     \
+    EXPECT_EQ(default_count + count, total_count);                       \
+  }
+
+TEST_DISPATCH_V2(AT_FLOAT8_TYPES_, 5, AT_FLOAT8_TYPES);
+TEST_DISPATCH_V2(AT_INTEGRAL_TYPES_, 5, AT_INTEGRAL_TYPES);
+TEST_DISPATCH_V2(AT_FLOATING_TYPES_, 2, AT_FLOATING_TYPES);
+TEST_DISPATCH_V2(AT_BAREBONES_UNSIGNED_TYPES_, 3, AT_BAREBONES_UNSIGNED_TYPES);
+TEST_DISPATCH_V2(AT_INTEGRAL_TYPES_V2_, 8, AT_INTEGRAL_TYPES_V2);
+TEST_DISPATCH_V2(AT_COMPLEX_TYPES_, 2, AT_COMPLEX_TYPES);
+TEST_DISPATCH_V2(AT_QINT_TYPES_, 3, AT_QINT_TYPES);
+TEST_DISPATCH_V2(AT_ALL_TYPES_, 7, AT_ALL_TYPES);
+TEST_DISPATCH_V2(AT_ALL_TYPES_AND_COMPLEX_, 9, AT_ALL_TYPES_AND_COMPLEX);
+
+#undef DEFINE_ITEM
--- a/test/cpp/api/init_baseline.py
+++ b/test/cpp/api/init_baseline.py
@ -64,7 +64,7 @@ def run(initializer):

 def main():
    initializer_parameter_map = {}
-    for initializer in INITIALIZERS.keys():
+    for initializer in INITIALIZERS:
        sys.stderr.write(f"Evaluating {initializer} ...\n")
        initializer_parameter_map[initializer] = run(initializer)

--- a/test/cpp/api/optim_baseline.py
+++ b/test/cpp/api/optim_baseline.py
@ -130,7 +130,7 @@ def main():
    options = parser.parse_args()

    optimizer_parameter_map = {}
-    for optimizer in OPTIMIZERS.keys():
+    for optimizer in OPTIMIZERS:
        sys.stderr.write(f"Evaluating {optimizer} ...\n")
        optimizer_parameter_map[optimizer] = run(
            optimizer, options.iterations, options.sample_every
--- a/test/cpp_extensions/libtorch_agnostic_extension/libtorch_agnostic/csrc/kernel.cpp
+++ b/test/cpp_extensions/libtorch_agnostic_extension/libtorch_agnostic/csrc/kernel.cpp
@ -67,13 +67,13 @@ Tensor sgd_out_of_place(

 void boxed_sgd_out_of_place(StableIValue* stack, uint64_t num_args, uint64_t num_outputs) {
  Tensor res = sgd_out_of_place(
-    to<Tensor>(stack[0]),
-    to<Tensor>(stack[1]),
-    float(to<double>(stack[2])),
-    to<double>(stack[3]),
-    to<bool>(stack[4]));
+    torch::stable::detail::to<Tensor>(stack[0]),
+    torch::stable::detail::to<Tensor>(stack[1]),
+    float(torch::stable::detail::to<double>(stack[2])),
+    torch::stable::detail::to<double>(stack[3]),
+    torch::stable::detail::to<bool>(stack[4]));

-  stack[0] = from(res);
+  stack[0] = torch::stable::detail::from(res);
 }

 STABLE_TORCH_LIBRARY(libtorch_agnostic, m) {
@ -89,8 +89,8 @@ Tensor identity(Tensor t) {
 }

 void boxed_identity(StableIValue* stack, uint64_t num_args, uint64_t num_outputs) {
-  Tensor res = identity(to<Tensor>(stack[0]));
-  stack[0] = from(res);
+  Tensor res = identity(torch::stable::detail::to<Tensor>(stack[0]));
+  stack[0] = torch::stable::detail::from(res);
 }

 STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
@ -108,14 +108,14 @@ STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CPU, m) {
 Tensor my_abs(Tensor t) {
  const auto num_args = 1;
  StableIValue stack[num_args];
-  stack[0] = from(t);
+  stack[0] = torch::stable::detail::from(t);
  aoti_torch_call_dispatcher("aten::abs", "", stack);
-  return to<Tensor>(stack[0]);
+  return torch::stable::detail::to<Tensor>(stack[0]);
 }

 void boxed_my_abs(StableIValue* stack, uint64_t num_args, uint64_t num_outputs) {
-  Tensor tensor_res = my_abs(to<Tensor>(stack[0]));
-  stack[0] = from(tensor_res);
+  Tensor tensor_res = my_abs(torch::stable::detail::to<Tensor>(stack[0]));
+  stack[0] = torch::stable::detail::from(tensor_res);
 }

 STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
@ -132,21 +132,21 @@ Tensor my_ones_like(Tensor t, StableIValue device) {

  auto mf = aoti_torch_memory_format_contiguous_format();

-  stack[0] = from(t);
-  stack[1] = from(std::optional(t.scalar_type()));    // dtype
-  stack[2] = from(std::nullopt);              // layout
-  stack[3] = from(std::optional(device));     // device
-  stack[4] = from(std::optional(false));      // pin_memory
-  stack[5] = from(std::optional(mf));         // memory_format
+  stack[0] = torch::stable::detail::from(t);
+  stack[1] = torch::stable::detail::from(std::optional(t.scalar_type()));    // dtype
+  stack[2] = torch::stable::detail::from(std::nullopt);              // layout
+  stack[3] = torch::stable::detail::from(std::optional(device));     // device
+  stack[4] = torch::stable::detail::from(std::optional(false));      // pin_memory
+  stack[5] = torch::stable::detail::from(std::optional(mf));         // memory_format

  aoti_torch_call_dispatcher("aten::ones_like", "", stack);

-  return to<Tensor>(stack[0]);
+  return torch::stable::detail::to<Tensor>(stack[0]);
 }

 void boxed_my_ones_like(StableIValue* stack, uint64_t num_args, uint64_t num_outputs) {
-  Tensor res = my_ones_like(to<Tensor>(stack[0]), stack[1]);
-  stack[0] = from(res);
+  Tensor res = my_ones_like(torch::stable::detail::to<Tensor>(stack[0]), stack[1]);
+  stack[0] = torch::stable::detail::from(res);
 }

 STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
@ -159,28 +159,28 @@ STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeExplicitAutograd, m) {

 std::tuple<Tensor, Tensor, bool> exp_neg_is_leaf(Tensor t1, Tensor t2, Tensor t3) {
  StableIValue stack_exp[1];
-  stack_exp[0] = from(t1);
+  stack_exp[0] = torch::stable::detail::from(t1);
  aoti_torch_call_dispatcher("aten::exp", "", stack_exp);

  StableIValue stack_neg[1];
-  stack_neg[0] = from(t2);
+  stack_neg[0] = torch::stable::detail::from(t2);
  aoti_torch_call_dispatcher("aten::neg", "", stack_neg);

  StableIValue stack_is_leaf[1];
-  stack_is_leaf[0] = from(t3);
+  stack_is_leaf[0] = torch::stable::detail::from(t3);
  aoti_torch_call_dispatcher("aten::is_leaf", "", stack_is_leaf);

  return std::make_tuple(
-    to<Tensor>(stack_exp[0]),
-    to<Tensor>(stack_neg[0]),
-    to<bool>(stack_is_leaf[0]));
+    torch::stable::detail::to<Tensor>(stack_exp[0]),
+    torch::stable::detail::to<Tensor>(stack_neg[0]),
+    torch::stable::detail::to<bool>(stack_is_leaf[0]));
 }

 void boxed_exp_neg_is_leaf(StableIValue* stack, uint64_t num_args, uint64_t num_outputs) {
-  auto tuple = exp_neg_is_leaf(to<Tensor>(stack[0]), to<Tensor>(stack[1]), to<Tensor>(stack[2]));
-  stack[0] = from(std::get<0>(tuple));
-  stack[1] = from(std::get<1>(tuple));
-  stack[2] = from(std::get<2>(tuple));
+  auto tuple = exp_neg_is_leaf(torch::stable::detail::to<Tensor>(stack[0]), torch::stable::detail::to<Tensor>(stack[1]), torch::stable::detail::to<Tensor>(stack[2]));
+  stack[0] = torch::stable::detail::from(std::get<0>(tuple));
+  stack[1] = torch::stable::detail::from(std::get<1>(tuple));
+  stack[2] = torch::stable::detail::from(std::get<2>(tuple));
 }

 STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
@ -193,15 +193,15 @@ STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeExplicitAutograd, m) {

 Tensor neg_exp(Tensor t) {
  StableIValue stack[1];
-  stack[0] = from(t);
+  stack[0] = torch::stable::detail::from(t);
  aoti_torch_call_dispatcher("aten::exp", "", stack);
  aoti_torch_call_dispatcher("aten::neg", "", stack);
-  return to<Tensor>(stack[0]);
+  return torch::stable::detail::to<Tensor>(stack[0]);
 }

 void boxed_neg_exp(StableIValue* stack, uint64_t num_args, uint64_t num_outputs) {
-  Tensor res = neg_exp(to<Tensor>(stack[0]));
-  stack[0] = from(res);
+  Tensor res = neg_exp(torch::stable::detail::to<Tensor>(stack[0]));
+  stack[0] = torch::stable::detail::from(res);
 }

 STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
@ -214,10 +214,10 @@ STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeExplicitAutograd, m) {

 Tensor divide_neg_exp(Tensor t) {
  StableIValue stack_neg[1];
-  stack_neg[0] = from(t);
+  stack_neg[0] = torch::stable::detail::from(t);

  StableIValue stack_exp[1];
-  stack_exp[0] = from(t);
+  stack_exp[0] = torch::stable::detail::from(t);
  aoti_torch_call_dispatcher("aten::exp", "", stack_exp);
  aoti_torch_call_dispatcher("aten::neg", "", stack_neg);

@ -225,12 +225,12 @@ Tensor divide_neg_exp(Tensor t) {
  stack_div[0] = stack_neg[0];
  stack_div[1] = stack_exp[0];
  aoti_torch_call_dispatcher("aten::divide", "Tensor", stack_div);
-  return to<Tensor>(stack_div[0]);
+  return torch::stable::detail::to<Tensor>(stack_div[0]);
 }

 void boxed_divide_neg_exp(StableIValue* stack, uint64_t num_args, uint64_t num_outputs) {
-  Tensor res = divide_neg_exp(to<Tensor>(stack[0]));
-  stack[0] = from(res);
+  Tensor res = divide_neg_exp(torch::stable::detail::to<Tensor>(stack[0]));
+  stack[0] = torch::stable::detail::from(res);
 }

 STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
@ -246,8 +246,8 @@ bool is_contiguous(Tensor t) {
 }

 void boxed_is_contiguous(StableIValue* stack, uint64_t num_args, uint64_t num_outputs) {
-  bool res = is_contiguous(to<Tensor>(stack[0]));
-  stack[0] = from(res);
+  bool res = is_contiguous(torch::stable::detail::to<Tensor>(stack[0]));
+  stack[0] = torch::stable::detail::from(res);
 }

 STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
@ -263,9 +263,9 @@ Tensor my_transpose(Tensor t, int64_t dim0, int64_t dim1) {
 }

 void boxed_my_transpose(StableIValue* stack, uint64_t num_args, uint64_t num_outputs) {
-  auto res = my_transpose(to<Tensor>(stack[0]), to<int64_t>(stack[1]), to<int64_t>(stack[2]));
+  auto res = my_transpose(torch::stable::detail::to<Tensor>(stack[0]), torch::stable::detail::to<int64_t>(stack[1]), torch::stable::detail::to<int64_t>(stack[2]));

-  stack[0] = from(res);
+  stack[0] = torch::stable::detail::from(res);
 }

 Tensor my_empty_like(Tensor t) {
@ -273,8 +273,8 @@ Tensor my_empty_like(Tensor t) {
 }

 void boxed_empty_like(StableIValue* stack, uint64_t num_args, uint64_t num_outputs) {
-  auto res = my_empty_like(to<Tensor>(stack[0]));
-  stack[0] = from(res);
+  auto res = my_empty_like(torch::stable::detail::to<Tensor>(stack[0]));
+  stack[0] = torch::stable::detail::from(res);
 }

 bool my_is_cpu(Tensor t) {
@ -283,8 +283,8 @@ bool my_is_cpu(Tensor t) {


 void boxed_my_is_cpu(StableIValue* stack, uint64_t num_args, uint64_t num_outputs) {
-  auto res = my_is_cpu(to<Tensor>(stack[0]));
-  stack[0] = from(res);
+  auto res = my_is_cpu(torch::stable::detail::to<Tensor>(stack[0]));
+  stack[0] = torch::stable::detail::from(res);
 }

 Tensor fill_infinity(Tensor t) {
@ -296,8 +296,8 @@ void boxed_fill_infinity(
    StableIValue* stack,
    uint64_t num_args,
    uint64_t num_outputs) {
-  auto res = fill_infinity(to<Tensor>(stack[0]));
-  stack[0] = from(res);
+  auto res = fill_infinity(torch::stable::detail::to<Tensor>(stack[0]));
+  stack[0] = torch::stable::detail::from(res);
 }

 Tensor my_pad(Tensor t) {
@ -310,8 +310,8 @@ void boxed_my_pad(
    StableIValue* stack,
    uint64_t num_args,
    uint64_t num_outputs) {
-  auto res = my_pad(to<Tensor>(stack[0]));
-  stack[0] = from(res);
+  auto res = my_pad(torch::stable::detail::to<Tensor>(stack[0]));
+  stack[0] = torch::stable::detail::from(res);
 }

 Tensor my_narrow(Tensor t, int64_t dim, int64_t start, int64_t length) {
@ -323,11 +323,11 @@ void boxed_my_narrow(
    uint64_t num_args,
    uint64_t num_outputs) {
  auto res = my_narrow(
-      to<Tensor>(stack[0]),
-      to<int64_t>(stack[1]),
-      to<int64_t>(stack[2]),
-      to<int64_t>(stack[3]));
-  stack[0] = from(res);
+      torch::stable::detail::to<Tensor>(stack[0]),
+      torch::stable::detail::to<int64_t>(stack[1]),
+      torch::stable::detail::to<int64_t>(stack[2]),
+      torch::stable::detail::to<int64_t>(stack[3]));
+  stack[0] = torch::stable::detail::from(res);
 }

 Tensor my_new_empty_dtype_variant(Tensor t) {
@ -342,8 +342,8 @@ Tensor my_new_empty_dtype_variant(Tensor t) {
 }

 void boxed_my_new_empty_dtype_variant(StableIValue* stack, uint64_t num_args, uint64_t num_outputs) {
-  auto res = my_new_empty_dtype_variant(to<Tensor>(stack[0]));
-  stack[0] = from(res);
+  auto res = my_new_empty_dtype_variant(torch::stable::detail::to<Tensor>(stack[0]));
+  stack[0] = torch::stable::detail::from(res);
 }

 Tensor my_new_zeros_dtype_variant(Tensor t) {
@ -352,8 +352,8 @@ Tensor my_new_zeros_dtype_variant(Tensor t) {
 }

 void boxed_my_new_zeros_dtype_variant(StableIValue* stack, uint64_t num_args, uint64_t num_outputs) {
-  auto res = my_new_zeros_dtype_variant(to<Tensor>(stack[0]));
-  stack[0] = from(res);
+  auto res = my_new_zeros_dtype_variant(torch::stable::detail::to<Tensor>(stack[0]));
+  stack[0] = torch::stable::detail::from(res);
 }

 Tensor my_copy_(Tensor dst, Tensor src, bool non_blocking) {
@ -361,8 +361,8 @@ Tensor my_copy_(Tensor dst, Tensor src, bool non_blocking) {
 }

 void boxed_my_copy_(StableIValue* stack, uint64_t num_args, uint64_t num_outputs) {
-  Tensor tensor_res = my_copy_(to<Tensor>(stack[0]), to<Tensor>(stack[1]), to<bool>(stack[2]));
-  stack[0] = from(tensor_res);
+  Tensor tensor_res = my_copy_(torch::stable::detail::to<Tensor>(stack[0]), torch::stable::detail::to<Tensor>(stack[1]), torch::stable::detail::to<bool>(stack[2]));
+  stack[0] = torch::stable::detail::from(tensor_res);
 }

 Tensor my_clone(Tensor t) {
@ -370,8 +370,8 @@ Tensor my_clone(Tensor t) {
 }

 void boxed_my_clone(StableIValue* stack, uint64_t num_args, uint64_t num_outputs) {
-  Tensor tensor_res = my_clone(to<Tensor>(stack[0]));
-  stack[0] = from(tensor_res);
+  Tensor tensor_res = my_clone(torch::stable::detail::to<Tensor>(stack[0]));
+  stack[0] = torch::stable::detail::from(tensor_res);
 }


@ -408,8 +408,8 @@ Tensor my_zero_(Tensor t) {
 }

 void boxed_my_zero_(StableIValue* stack, uint64_t num_args, uint64_t num_outputs) {
-  auto res = my_zero_(to<Tensor>(stack[0]));
-  stack[0] = from(res);
+  auto res = my_zero_(torch::stable::detail::to<Tensor>(stack[0]));
+  stack[0] = torch::stable::detail::from(res);
 }

 Tensor my_amax(Tensor t) {
@ -417,8 +417,8 @@ Tensor my_amax(Tensor t) {
 }

 void boxed_my_amax(StableIValue* stack, uint64_t num_args, uint64_t num_outputs) {
-  auto res = my_amax(to<Tensor>(stack[0]));
-  stack[0] = from(res);
+  auto res = my_amax(torch::stable::detail::to<Tensor>(stack[0]));
+  stack[0] = torch::stable::detail::from(res);
 }

 Tensor my_amax_vec(Tensor t) {
@ -426,8 +426,8 @@ Tensor my_amax_vec(Tensor t) {
 }

 void boxed_my_amax_vec(StableIValue* stack, uint64_t num_args, uint64_t num_outputs) {
-  auto res = my_amax_vec(to<Tensor>(stack[0]));
-  stack[0] = from(res);
+  auto res = my_amax_vec(torch::stable::detail::to<Tensor>(stack[0]));
+  stack[0] = torch::stable::detail::from(res);
 }

 STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
@ -464,8 +464,8 @@ void boxed_test_default_constructor(
    StableIValue* stack,
    uint64_t num_args,
    uint64_t num_outputs) {
-  bool res = test_default_constructor(to<bool>(stack[0]));
-  stack[0] = from(res);
+  bool res = test_default_constructor(torch::stable::detail::to<bool>(stack[0]));
+  stack[0] = torch::stable::detail::from(res);
 }

 STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
@ -478,6 +478,56 @@ STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeExplicitAutograd, m) {
  m.impl("my_amax_vec", &boxed_my_amax_vec);
 }

+std::vector<Tensor> my__foreach_mul(torch::headeronly::HeaderOnlyArrayRef<Tensor> self, torch::headeronly::HeaderOnlyArrayRef<Tensor> other) {
+  std::array<StableIValue, 2> stack = {torch::stable::detail::from(self), torch::stable::detail::from(other)};
+  aoti_torch_call_dispatcher("aten::_foreach_mul", "List", stack.data());
+  return torch::stable::detail::to<std::vector<Tensor>>(stack[0]);
+}
+
+void boxed_my__foreach_mul(StableIValue* stack, uint64_t num_args, uint64_t num_outputs) {
+  // Why is the following NOT torch::stable::detail::to<HeaderOnlyArrayRef<Tensor>>(stack[0])? Because calling `to`
+  // on a StableIValue means that the result owns its underlying data. HeaderOnlyArrayRef
+  // is non-owning, so it cannot safely hold the result of torch::stable::detail::to<>.
+  auto res = my__foreach_mul(torch::stable::detail::to<std::vector<Tensor>>(stack[0]), torch::stable::detail::to<std::vector<Tensor>>(stack[1]));
+  stack[0] = torch::stable::detail::from(res);
+}
+
+void my__foreach_mul_(torch::headeronly::HeaderOnlyArrayRef<Tensor> self, torch::headeronly::HeaderOnlyArrayRef<Tensor> other) {
+  std::array<StableIValue, 2> stack = {torch::stable::detail::from(self), torch::stable::detail::from(other)};
+  aoti_torch_call_dispatcher("aten::_foreach_mul_", "List", stack.data());
+}
+
+void boxed_my__foreach_mul_(StableIValue* stack, uint64_t num_args, uint64_t num_outputs) {
+  my__foreach_mul_(torch::stable::detail::to<std::vector<Tensor>>(stack[0]), torch::stable::detail::to<std::vector<Tensor>>(stack[1]));
+}
+
+std::vector<Tensor> make_tensor_clones_and_call_foreach(Tensor t1, Tensor t2) {
+  // This function tests that my__foreach_mul can take in std::initializer_lists
+  // in addition to std::vectors.
+  Tensor t1_1 = my_clone(t1);
+  Tensor t1_2 = my_clone(t1);
+  Tensor t2_1 = my_clone(t2);
+  Tensor t2_2 = my_clone(t2);
+  return my__foreach_mul({t1_1, t2_1}, {t1_2, t2_2});
+}
+
+void boxed_make_tensor_clones_and_call_foreach(StableIValue* stack, uint64_t num_args, uint64_t num_outputs) {
+  auto res = make_tensor_clones_and_call_foreach(torch::stable::detail::to<Tensor>(stack[0]), torch::stable::detail::to<Tensor>(stack[1]));
+  stack[0] = torch::stable::detail::from(res);
+}
+
+STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
+  m.def("my__foreach_mul(Tensor[] self, Tensor[] other) -> Tensor[]");
+  m.def("my__foreach_mul_(Tensor(a!)[] self, Tensor[] other) -> ()");
+  m.def("make_tensor_clones_and_call_foreach(Tensor t1, Tensor t2) -> Tensor[]");
+}
+
+STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeExplicitAutograd, m) {
+  m.impl("my__foreach_mul", &boxed_my__foreach_mul);
+  m.impl("my__foreach_mul_", &boxed_my__foreach_mul_);
+  m.impl("make_tensor_clones_and_call_foreach", &boxed_make_tensor_clones_and_call_foreach);
+}
+
 // Test functions for torch::stable::accelerator APIs

 #ifdef LAE_USE_CUDA
@ -500,8 +550,8 @@ void boxed_test_device_guard(
    StableIValue* stack,
    uint64_t num_args,
    uint64_t num_outputs) {
-  int res = test_device_guard(static_cast<int64_t>(to<int64_t>(stack[0])));
-  stack[0] = from(res);
+  int res = test_device_guard(static_cast<int64_t>(torch::stable::detail::to<int64_t>(stack[0])));
+  stack[0] = torch::stable::detail::from(res);
 }

 int64_t test_device_guard_set_index() {
@ -520,7 +570,7 @@ void boxed_test_device_guard_set_index(
    uint64_t num_args,
    uint64_t num_outputs) {
  int64_t res = test_device_guard_set_index();
-  stack[0] = from(res);
+  stack[0] = torch::stable::detail::from(res);
 }

 int64_t test_stream(int32_t device_index) {
@ -536,8 +586,8 @@ void boxed_test_stream(
    StableIValue* stack,
    uint64_t num_args,
    uint64_t num_outputs) {
-  int64_t res = test_stream(static_cast<int64_t>(to<int64_t>(stack[0])));
-  stack[0] = from(res);
+  int64_t res = test_stream(static_cast<int32_t>(torch::stable::detail::to<int64_t>(stack[0])));
+  stack[0] = torch::stable::detail::from(res);
 }

 int64_t test_get_current_device_index() {

 int64_t test_get_current_device_index() {
@ -549,7 +599,7 @@ void boxed_test_get_current_device_index(
    uint64_t num_args,
    uint64_t num_outputs) {
  int64_t res = test_get_current_device_index();
-  stack[0] = from(res);
+  stack[0] = torch::stable::detail::from(res);
 }

 STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
@ -565,4 +615,5 @@ STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeExplicitAutograd, m) {
  m.impl("test_stream", &boxed_test_stream);
  m.impl("test_get_current_device_index", &boxed_test_get_current_device_index);
 }
+
 #endif // LAE_USE_CUDA
--- a/test/cpp_extensions/libtorch_agnostic_extension/libtorch_agnostic/ops.py
+++ b/test/cpp_extensions/libtorch_agnostic_extension/libtorch_agnostic/ops.py
@ -333,3 +333,45 @@ def my_new_zeros_dtype_variant(t) -> Tensor:
    Returns: New zeros tensor
    """
    return torch.ops.libtorch_agnostic.my_new_zeros_dtype_variant.default(t)
+
+
+def my__foreach_mul_(tensors, others) -> None:
+    """
+    Updates tensors to be the result of pointwise multiplying with others.
+
+    Args:
+        tensors: list of tensors
+        others: list of tensors (with the same corresponding shapes as tensors)
+
+    Returns: nothing, tensors is updated in place.
+    """
+    torch.ops.libtorch_agnostic.my__foreach_mul_.default(tensors, others)
+
+
+def my__foreach_mul(tensors, others) -> list[Tensor]:
+    """
+    Returns a list of tensors that are the results of pointwise multiplying
+    tensors and others.
+
+    Args:
+        tensors: list of tensors
+        others: list of tensors (with the same corresponding shapes as tensors)
+
+    Returns: list of multiplied tensors
+    """
+    return torch.ops.libtorch_agnostic.my__foreach_mul.default(tensors, others)
+
+
+def make_tensor_clones_and_call_foreach(t1, t2) -> list[Tensor]:
+    """
+    Returns a list of 2 tensors corresponding to the square of the inputs.
+
+    Args:
+        t1: Tensor
+        t2: Tensor
+
+    Returns: list of [t1^2, t2^2]
+    """
+    return torch.ops.libtorch_agnostic.make_tensor_clones_and_call_foreach.default(
+        t1, t2
+    )
--- a/test/cpp_extensions/libtorch_agnostic_extension/test/test_libtorch_agnostic.py
+++ b/test/cpp_extensions/libtorch_agnostic_extension/test/test_libtorch_agnostic.py
@ -367,6 +367,57 @@ if not IS_WINDOWS:
            self.assertNotEqual(result.data_ptr(), expected.data_ptr())
            self.assertEqual(result.stride(), expected.stride())

+        def test_my__foreach_mul_(self, device):
+            import libtorch_agnostic
+
+            N = 5
+            tensors = [torch.rand(32, 16, device=device) for _ in range(N)]
+            tensors_c = [t.clone() for t in tensors]
+            others = [torch.rand(32, 16, device=device) for _ in range(N)]
+
+            libtorch_agnostic.ops.my__foreach_mul_(tensors, others)
+            expected_values = torch._foreach_mul(tensors_c, others)
+
+            for tensor_t, expected_t in zip(tensors, expected_values):
+                self.assertEqual(tensor_t, expected_t)
+
+        def test_my__foreach_mul(self, device):
+            import libtorch_agnostic
+
+            N = 5
+            tensors = [torch.rand(32, 16, device=device) for _ in range(N)]
+            others = [torch.rand(32, 16, device=device) for _ in range(N)]
+
+            result = libtorch_agnostic.ops.my__foreach_mul(tensors, others)
+            expected = torch._foreach_mul(tensors, others)
+
+            for result_t, expected_t in zip(result, expected):
+                self.assertEqual(result_t, expected_t)
+
+            def _make_cuda_tensors(prior_mem):
+                cuda_res = libtorch_agnostic.ops.my__foreach_mul(tensors, others)
+                self.assertGreater(torch.cuda.memory_allocated(device), prior_mem)
+
+                expected = torch._foreach_mul(tensors, others)
+                for result_t, expected_t in zip(cuda_res, expected):
+                    self.assertEqual(result_t, expected_t)
+
+            if tensors[0].is_cuda:
+                init_mem = torch.cuda.memory_allocated(device)
+                for _ in range(3):
+                    _make_cuda_tensors(init_mem)
+                    curr_mem = torch.cuda.memory_allocated(device)
+                    self.assertEqual(curr_mem, init_mem)
+
+        def test_make_tensor_clones_and_call_foreach(self, device):
+            import libtorch_agnostic
+
+            t1 = torch.rand(2, 5, device=device)
+            t2 = torch.rand(3, 4, device=device)
+            result = libtorch_agnostic.ops.make_tensor_clones_and_call_foreach(t1, t2)
+            self.assertEqual(result[0], t1 * t1)
+            self.assertEqual(result[1], t2 * t2)
+
    instantiate_device_type_tests(TestLibtorchAgnostic, globals(), except_for=None)

 if __name__ == "__main__":
--- a/test/custom_backend/test_custom_backend.py
+++ b/test/custom_backend/test_custom_backend.py
@ -11,6 +11,7 @@ from torch.testing._internal.common_utils import run_tests, TestCase

 class TestCustomBackend(TestCase):
    def setUp(self):
+        super().setUp()
        # Load the library containing the custom backend.
        self.library_path = get_custom_backend_library_path()
        torch.ops.load_library(self.library_path)
--- a/test/custom_operator/test_custom_ops.py
+++ b/test/custom_operator/test_custom_ops.py
@ -18,6 +18,7 @@ torch.ops.import_module("pointwise")

 class TestCustomOperators(TestCase):
    def setUp(self):
+        super().setUp()
        self.library_path = get_custom_op_library_path()
        ops.load_library(self.library_path)

--- a/test/distributed/_composable/test_replicate_with_fsdp.py
+++ b/test/distributed/_composable/test_replicate_with_fsdp.py
@ -76,7 +76,7 @@ class ReplicateTest(MultiProcessTestCase):
            store=dist.FileStore(self.file_name, self.world_size),
        )

-    @skip_if_lt_x_gpu(2)
+    @skip_if_lt_x_gpu(4)
    def test_replicate_transformer(self):
        """
        This tests that replicate works on a transformer model with fully_shard and replicate layers
@ -126,7 +126,7 @@ class ReplicateTest(MultiProcessTestCase):
                for parameter in layer.parameters():
                    self.assertEqual(parameter.placements, (Shard(dim=0),))

-    @skip_if_lt_x_gpu(2)
+    @skip_if_lt_x_gpu(4)
    def test_replicate_transformer_managed_modules(self):
        """
        This tests that replicate managed modules works properly. In this test we use a Transformer Module with 3 layers,
@ -178,7 +178,7 @@ class ReplicateTest(MultiProcessTestCase):
        replicate_model = replicate(replicate_model)
        self.assertEqual(len(_get_managed_modules((replicate_model,))), 21)

-    @skip_if_lt_x_gpu(2)
+    @skip_if_lt_x_gpu(4)
    def test_replicate_tp_device_mesh(self):
        """
        This tests that a user can pass in a device mesh to replicate a module
@ -206,7 +206,7 @@ class ReplicateTest(MultiProcessTestCase):
                self.assertEqual(parameter.device_mesh.shape, (2,))
                self.assertEqual(parameter.placements, (Replicate(),))

-    @skip_if_lt_x_gpu(2)
+    @skip_if_lt_x_gpu(4)
    def test_train_replicate_fsdp(self):
        """
        Tests that replicate_model has the same behavior as original model when training
@ -253,7 +253,7 @@ class ReplicateTest(MultiProcessTestCase):
            self.assertEqual(replicate_loss, loss)
            check_sharded_parity(self, model, replicate_model)

-    @skip_if_lt_x_gpu(2)
+    @skip_if_lt_x_gpu(4)
    def test_train_parity_2d_mlp(self):
        """
        Verifies when a device mesh is passed in, the model has the same behavior as the original model when training
--- a/test/distributed/checkpoint/_experimental/test_builder.py
+++ b/test/distributed/checkpoint/_experimental/test_builder.py
@ -22,6 +22,7 @@ from torch.testing._internal.common_utils import run_tests, TestCase

 class TestMakeCheckpointer(TestCase):
    def setUp(self) -> None:
+        super().setUp()
        # Create a temporary directory for checkpoints
        self.temp_dir = tempfile.mkdtemp()

--- a/test/distributed/checkpoint/_experimental/test_checkpoint_process.py
+++ b/test/distributed/checkpoint/_experimental/test_checkpoint_process.py
@ -161,6 +161,7 @@ class TestCheckpointProcessConfig(TestCase):

 class TestCheckpointProcess(TestCase):
    def setUp(self) -> None:
        """Set up common test fixtures."""
+        super().setUp()
        self.rank_info = RankInfo(
            global_world_size=1,
--- a/test/distributed/checkpoint/_experimental/test_checkpoint_reader.py
+++ b/test/distributed/checkpoint/_experimental/test_checkpoint_reader.py
@ -14,6 +14,7 @@ from torch.testing._internal.common_utils import run_tests, TestCase

 class TestCheckpointReader(TestCase):
    def setUp(self):
+        super().setUp()
        # Create a temporary directory for test checkpoints
        self.temp_dir = tempfile.mkdtemp()

--- a/test/distributed/checkpoint/_experimental/test_checkpoint_writer.py
+++ b/test/distributed/checkpoint/_experimental/test_checkpoint_writer.py
@ -52,6 +52,7 @@ class TestCheckpointWriterConfig(TestCase):

 class TestCheckpointWriter(TestCase):
    def setUp(self):
+        super().setUp()
        # Create a temporary directory for test checkpoints
        self.temp_dir = tempfile.mkdtemp()

--- a/test/distributed/checkpoint/_experimental/test_checkpointer.py
+++ b/test/distributed/checkpoint/_experimental/test_checkpointer.py
@ -52,6 +52,7 @@ class TestCheckpointer(TestCase):
    """Parameterized tests that work with both sync and async checkpointers."""

    def setUp(self):
+        super().setUp()
        # Create a temporary directory for checkpoints
        self.temp_dir = tempfile.mkdtemp()

@ -397,6 +398,7 @@ class TestAsyncCheckpointerSpecific(TestCase):
    """Tests specific to AsyncCheckpointer functionality."""

    def setUp(self):
+        super().setUp()
        # Create a temporary directory for checkpoints
        self.temp_dir = tempfile.mkdtemp()

--- a/test/distributed/checkpoint/_experimental/test_staging.py
+++ b/test/distributed/checkpoint/_experimental/test_staging.py
@ -12,6 +12,7 @@ from torch.testing._internal.common_utils import requires_cuda, run_tests, TestC

 class TestDefaultStager(TestCase):
    def setUp(self) -> None:
+        super().setUp()
        # Create a test state dictionary with various data types
        self.state_dict = {
            "model": torch.nn.Linear(10, 5).state_dict(),
--- a/test/distributed/checkpoint/test_dtensor_resharding.py
+++ b/test/distributed/checkpoint/test_dtensor_resharding.py
@ -299,7 +299,7 @@ class TestDTensorReshardMeshChange(DTensorTestBase):

    @with_comms
    @with_temp_dir
-    @skip_if_lt_x_gpu(2)
+    @skip_if_lt_x_gpu(4)
    def test_dtensor_checkpoint_with_uneven_shards(self) -> None:
        """
        Saving a dtensor with uneven shards.
@ -436,6 +436,7 @@ class TestCheckpointableReshard(DTensorTestBase):

    @with_comms
    @with_temp_dir
+    @skip_if_lt_x_gpu(4)
    def test_uneven_reshard_with_checkpointable_api(self) -> None:
        """
        Saves a 1d distributed tensor that has shards with uneven sizes using Checkpointable API.
@ -498,6 +499,7 @@ class TestCheckpointableReshard(DTensorTestBase):

    @with_comms
    @with_temp_dir
+    @skip_if_lt_x_gpu(4)
    def test_uneven_reshard_with_dtensor_shards_wrapper_api(self) -> None:
        """
        Saves a 1d distributed tensor that has shards with uneven sizes using Checkpointable API.
--- a/test/distributed/checkpoint/test_hf_safetensor_e2e.py
+++ b/test/distributed/checkpoint/test_hf_safetensor_e2e.py
@ -208,7 +208,7 @@ class TestSingleRankSaveLoad(TestCase):

        # Create model.safetensors.index.json with weight mapping
        weight_map = {}
-        for key in quantized_checkpoint.keys():
+        for key in quantized_checkpoint:
            weight_map[key] = "model.safetensors"

        index_data = {
@ -245,7 +245,7 @@ class TestSingleRankSaveLoad(TestCase):
            sorted(original_tensors.keys()), sorted(state_dict_to_load.keys())
        )

-        for tensor_name in original_tensors.keys():
+        for tensor_name in original_tensors:
            original = original_tensors[tensor_name]
            loaded = state_dict_to_load[tensor_name]

--- a/test/distributed/checkpoint/test_quantized_hf_storage.py
+++ b/test/distributed/checkpoint/test_quantized_hf_storage.py
@ -15,6 +15,7 @@ from torch.testing._internal.common_utils import run_tests, TestCase

 class TestQuantizedHfStorage(TestCase):
    def setUp(self):
        """Set up common test fixtures."""
+        super().setUp()
        self.temp_dir = tempfile.TemporaryDirectory()
        self.path = self.temp_dir.name
--- a/test/distributed/checkpoint/test_state_dict.py
+++ b/test/distributed/checkpoint/test_state_dict.py
@ -886,7 +886,7 @@ class TestStateDict(DTensorTestBase, VerifyStateDictMixin):
            self.assertEqual(cpu_model_value, meta_model_value)

    @with_comms
-    @skip_if_lt_x_gpu(2)
+    @skip_if_lt_x_gpu(4)
    def test_setting_meta_device_model_broadcasting_and_memory(self) -> None:
        # This test verifies that we can set model state dict by a meta device model
        # With the correlated changes in state_dict, meta device model should be accepted
--- a/test/distributed/elastic/multiprocessing/test_api.py
+++ b/test/distributed/elastic/multiprocessing/test_api.py
@ -21,6 +21,7 @@ from torch.testing._internal.common_utils import run_tests, TestCase

 class SignalHandlingTest(TestCase):
    def setUp(self):
+        super().setUp()
        # Save original environment variable if it exists
        self.original_signals_env = os.environ.get(
            "TORCHELASTIC_SIGNALS_TO_HANDLE", None
--- a/test/distributed/fsdp/test_fsdp_misc.py
+++ b/test/distributed/fsdp/test_fsdp_misc.py
@ -479,6 +479,7 @@ class TestFSDPMiscMultiProcess(FSDPTest):
                    for (n, p), (n_prev, p_prev) in zip(
                        fsdp_overlap.named_parameters(), fsdp_overlap_prev_params
                    ):
+                        self.assertEqual(n, n_prev)
                        self.assertNotEqual(
                            p,
                            p_prev,
--- a/test/distributed/fsdp/test_fsdp_mixed_precision.py
+++ b/test/distributed/fsdp/test_fsdp_mixed_precision.py
@ -498,7 +498,7 @@ class TestFSDPMixedPrecision(FSDPTest):
                    for name, tensor in state_dict.items():
                        # Parameters and buffers are checkpointed in their
                        # original dtypes, which may be different.
-                        if name in named_buffers.keys():
+                        if name in named_buffers:
                            self.assertEqual(tensor.dtype, _BUFFER_ORIG_DTYPE)
                        else:
                            self.assertEqual(
--- a/test/distributed/launcher/test_api.py
+++ b/test/distributed/launcher/test_api.py
@ -16,6 +16,7 @@ from torch.testing._internal.common_utils import run_tests, TestCase

 class LauncherApiTest(TestCase):
    def setUp(self):
+        super().setUp()
        # Save original environment variable if it exists
        self.original_signals_env = os.environ.get(
            "TORCHELASTIC_SIGNALS_TO_HANDLE", None
--- a/test/distributed/pipelining/test_schedule.py
+++ b/test/distributed/pipelining/test_schedule.py
@ -21,6 +21,7 @@ from torch.distributed.pipelining import (
 from torch.distributed.pipelining._utils import generate_stage_to_rank_mapping
 from torch.distributed.pipelining.schedules import (
    _Action,
+    _add_reduce_grad,
    _add_send_recv,
    _add_unshard_reshard,
    _format_pipeline_order,
@ -574,6 +575,45 @@ class TestScheduleLowering(TestCase):
                ),
            )

+    @parametrize(
+        "test_info",
+        [
+            # Case 1: only "0..." actions; the expected list ends with a single
+            # "0REDUCE_GRAD" placed after "0B1". The blank "   " compute entry
+            # has no counterpart in the expected list.
+            {
+                "compute": ["0F0", "0F1", "   ", "0B0", "0B1"],
+                "comms": ["0F0", "0F1", "0B0", "0B1", "0REDUCE_GRAD"],
+            },
+            # Case 2: "0..." and "1..." actions; "1REDUCE_GRAD" is expected
+            # right after "1B1" and "0REDUCE_GRAD" right after "0B1".
+            {
+                "compute": ["0F0", "0F1", "1F0", "1F1", "1B0", "1B1", "0B0", "0B1"],
+                "comms": [
+                    "0F0",
+                    "0F1",
+                    "1F0",
+                    "1F1",
+                    "1B0",
+                    "1B1",
+                    "1REDUCE_GRAD",
+                    "0B0",
+                    "0B1",
+                    "0REDUCE_GRAD",
+                ],
+            },
+        ],
+    )
+    def test_reduce_grad(self, test_info):
+        """Check that ``_add_reduce_grad`` yields the expected schedule.
+
+        ``test_info["compute"]`` is parsed into the input schedule and
+        ``test_info["comms"]`` into the expected output; the two are then
+        compared action by action.
+        """
+        compute_sch = self._parse_actions(test_info["compute"])
+        expected_comms_sch = self._parse_actions(test_info["comms"])
+
+        # NOTE(review): the literal 2 presumably matches the two microbatch
+        # indices (0 and 1) used in the cases above -- confirm against the
+        # signature of _add_reduce_grad.
+        comms_sch = _add_reduce_grad(compute_sch, 2)
+        # strict=True makes zip raise ValueError on a length mismatch, so a
+        # missing or extra action fails the test instead of being ignored.
+        for expected, actual in zip(expected_comms_sch, comms_sch, strict=True):
+            self.assertEqual(
+                expected,
+                actual,
+                (
+                    f"Mismatch: expected action {expected} but found {actual}."
+                    f"\nWhole Schedule: {comms_sch}"
+                ),
+            )
+
    @parametrize(
        "test_info",
        [
--- a/Show More
+++ b/Show More