Compare commits


165 Commits

Author SHA1 Message Date
17d5b05ae0 Enable dynamic shapes for CuTeDSL GroupGemm
[ghstack-poisoned]
2025-11-11 09:37:43 -08:00
4c3721fe70 allow sym_stride, and sym_size lowering in inductor to return ints (#167345)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/167345
Approved by: https://github.com/eellison
2025-11-10 23:29:23 +00:00
8ef4099313 Revert "Add min/max support for barebones uint types (#166813)"
This reverts commit 9ffc480c5a928eaccb4ac0e1755a1c596674d884.

Reverted https://github.com/pytorch/pytorch/pull/166813 on behalf of https://github.com/jeanschmidt due to It was reverted internally 6 days ago, but not reverted on OSS, this is causing conflicts ([comment](https://github.com/pytorch/pytorch/pull/166813#issuecomment-3514328895))
2025-11-10 23:25:22 +00:00
de773364be Support AC in default partitioner when functionalization is enabled (#166610)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166610
Approved by: https://github.com/SherlockNoMad
ghstack dependencies: #166536
2025-11-10 23:09:01 +00:00
47da714b8b [inductor][determinism] type errors + use odc to dump imc on exit (#167136)
Summary: Fix some type errors. Also, instead of manually creating a filelock when dumping the dcache's imc to a file, we simply use an odc (since this is the intended behavior of odc anyway).

Test Plan:
```
buck test fbcode//mode/opt caffe2/test/inductor:caching
```

Differential Revision: D86345594

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167136
Approved by: https://github.com/aorenste
2025-11-10 22:51:03 +00:00
69ab1f93e4 Add shim for at::get_num_threads (#167362)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/167362
Approved by: https://github.com/janeyx99
ghstack dependencies: #166579, #166694, #166695
2025-11-10 22:21:14 +00:00
232baa33b3 Redo add parallel_for to torch/csrc/stable (#166695)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166695
Approved by: https://github.com/malfet
ghstack dependencies: #166579, #166694
2025-11-10 22:21:14 +00:00
6f0182495f Add stable::Tensor.device() (#166694)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166694
Approved by: https://github.com/janeyx99
ghstack dependencies: #166579
2025-11-10 22:21:14 +00:00
7da82b84e2 Add torch::stable::Device (#166579)
Prior to this PR, the IValue <-> StableIValue conversion for `DeviceObjType` (aka c10::Device) packed it into the leading bits of the StableIValue (which is a uint64_t).

After this PR, the IValue <-> StableIValue conversion for `DeviceObjType` expects DeviceType to be packed into the upper 32 bits of the StableIValue and DeviceIndex to be packed into the lower 32 bits.
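
For intuition, a minimal Python sketch of the packing scheme described above (function names and the sign handling of the index are assumptions for illustration; the real conversion lives in the C++ shim):

```
# Illustrative only: DeviceType in the upper 32 bits of the 64-bit StableIValue,
# DeviceIndex in the lower 32 bits (sign handling of the index is omitted here).
def pack_device(device_type: int, device_index: int) -> int:
    return ((device_type & 0xFFFFFFFF) << 32) | (device_index & 0xFFFFFFFF)

def unpack_device(packed: int) -> tuple[int, int]:
    return (packed >> 32) & 0xFFFFFFFF, packed & 0xFFFFFFFF

packed = pack_device(device_type=1, device_index=3)  # e.g. CUDA is device type 1 in c10
assert unpack_device(packed) == (1, 3)
```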

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166579
Approved by: https://github.com/janeyx99
2025-11-10 22:21:14 +00:00
cda7604434 [ez] Remove spammy deprecation log (#167470)
"
/packages/pytorch_latest_sixlib_conda/conda/lib/python3.12/site-packages/torch/_dynamo/variables/user_defined.py:1815: FutureWarning: `isinstance(treespec, LeafSpec)` is deprecated, use `isinstance(treespec, TreeSpec) and treespec.is_leaf()` instead.
  return ctor(*args, **kwargs)"

is too spammy

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167470
Approved by: https://github.com/tugsbayasgalan
2025-11-10 21:49:23 +00:00
6ca8cc6edf Rework PyObject preservation (#166342)
Make the PyObject preservation scheme thread-safe with free threaded (nogil) Python. The general idea is:

* Python Tensor and Storage objects always hold a strong reference to their underlying c10 object
* c10 objects hold a strong reference to their Python objects if there's at least one other reference to the c10 object

This is implemented in `intrusive_ptr`:

* The topmost bit (`kHasPyObject`) of the weakref count is now used to indicate whether the `intrusive_ptr_target` has an associated PyObject. So `kHasPyObject` is one bit, the weakref count is now 31 bits, and the strong refcount remains 32 bits (a bit-layout sketch follows below).
* When the reference count increases from one to two and `kHasPyObject` is set, we incref the associated Python object to ensure that it's kept alive.
* When the reference count decreases from two to one (i.e., there are no C++ references to the `intrusive_ptr_target` other than from the Python object), we decref the associated Python object to break the cycle.
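
A small Python illustration of the bit layout described above (the constant names are assumptions; the real logic is C++ atomics inside `intrusive_ptr`):

```
K_HAS_PYOBJECT = 1 << 31              # topmost bit of the 32-bit weakref count
WEAK_COUNT_MASK = K_HAS_PYOBJECT - 1  # the remaining 31 bits hold the weakref count

def has_pyobject(weakcount_word: int) -> bool:
    return bool(weakcount_word & K_HAS_PYOBJECT)

def weak_count(weakcount_word: int) -> int:
    return weakcount_word & WEAK_COUNT_MASK

# Policy sketch: when the bit is set, incref the PyObject on the 1 -> 2 strong
# refcount transition and decref it on the 2 -> 1 transition, as described above.
word = K_HAS_PYOBJECT | 5
assert has_pyobject(word) and weak_count(word) == 5
```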

Other benefits:

* We can delete a lot of the copypasta from Python's internal `subtype_dealloc`
* This fixes the weakref and GC bugs we had in the previous scheme. Python weakrefs on Tensors and Storages should just work as expected now.

Risks:

* Extra branch for reference count operations on `intrusive_ptr<TensorImpl>`, `intrusive_ptr<StorageImpl>`, and the generic `intrusive_ptr<intrusive_ptr_target>` even when we're not using Python.
* It's a big change
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166342
Approved by: https://github.com/albanD
2025-11-10 21:47:53 +00:00
bb37483464 Use c7i.2xlarge for B200 build (#167078)
The build system is oversized for what is necessary. Reduce the size to optimize costs. The default workflow runner is `linux.c7i.2xlarge` so we are just removing the runner definition in the workflow so that it uses the default.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167078
Approved by: https://github.com/nWEIdia, https://github.com/seemethere
2025-11-10 21:45:45 +00:00
2751b1d3c3 Support repr on user defined objects (#167372)
Fixes: https://github.com/pytorch/pytorch/issues/167369

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167372
Approved by: https://github.com/anijain2305
2025-11-10 21:06:37 +00:00
fe0bb7cf60 [export, 3.14] handle patching methods with functools.partial correctly in non-strict export (#167396)
Note: dynamo is not affected by this since patching class methods is not supported right now.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167396
Approved by: https://github.com/angelayi
ghstack dependencies: #167382, #167383, #167384, #167387
2025-11-10 20:52:05 +00:00
cf63b212e3 [3.14, dataloader] handle forkserver default mp start method in 3.14 (#167387)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/167387
Approved by: https://github.com/malfet
ghstack dependencies: #167382, #167383, #167384
2025-11-10 20:52:05 +00:00
17e70ae459 [dynamo, 3.14] enable dynamo in 3.14 (#167384)
dynamo tests are passing in the CI PR above - so we could probably just enable dynamo right now.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167384
Approved by: https://github.com/Skylion007, https://github.com/mlazos
ghstack dependencies: #167382, #167383
2025-11-10 20:52:05 +00:00
ad7db3617e [inductor, 3.14] catch pickle.PicklingError exceptions (#167383)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/167383
Approved by: https://github.com/aorenste
ghstack dependencies: #167382
2025-11-10 20:52:04 +00:00
5320ca3725 [inductor, 3.14] fix itertools.product pickle error in test_cpu_repro (#167382)
`inductor/test_cpu_cpp_wrapper` was failing since it was attempting to pickle `itertools.product`, which is no longer picklable in 3.14. We work around this by eagerly generating a list.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167382
Approved by: https://github.com/atalman, https://github.com/malfet
2025-11-10 20:52:04 +00:00
3e4faca130 [torch.export] Refactor placeholder_naming_pass to reduce CCN (#166600)
Summary: Reduced the CCN of the placeholder_naming_pass method from 37 to 28

Test Plan: Existing tests

Differential Revision: D85820388

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166600
Approved by: https://github.com/angelayi
2025-11-10 20:44:18 +00:00
0c2f206ded Typo fix - baddbmm_strategy (#166963)
This is registered via a decorator, so the function is not called directly. For clarity, add the "b" for "batch" to the function name.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166963
Approved by: https://github.com/janeyx99
2025-11-10 20:35:42 +00:00
6cf21fa331 Fix -ffunction-sections, -fdata-sections not being added on aarch64. (#166407)
Preferred solution to #166380

Changes:

- Moved the summary print to the bottom of CMakeLists.txt.
- Fixed the problem that `add_compile_options` should be called before targets are defined, by opting for `append_cxx_flag_if_supported` and the new `append_c_flag_if_supported`.
- Added extra verbosity so it can be seen when the linker script is added.

(Unfortunately the linker script has to be added per-target rather than globally due to ninja/cmake dependency tracking.)

Also moves the summary print to the bottom of CMakeLists.txt and improves logging.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166407
Approved by: https://github.com/Aidyn-A, https://github.com/atalman
2025-11-10 20:32:08 +00:00
cdc8460f2c Use c7i.2xlarge for H100 build (#167466)
The build system may be oversized for what is necessary. Reduce the size to optimize costs. The default workflow runner is `linux.c7i.2xlarge`, so we are just removing the runner definition in the workflow so that it uses the default.

Relates to pytorch/test-infra#7175.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167466
Approved by: https://github.com/seemethere
2025-11-10 20:20:54 +00:00
86130aa2ca Fix flaky memory profiler test [2] (#167268)
Fixes #167037

Move the module definition outside of the unit test so that when we run the unit test multiple times, the module is not re-compiled.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/167268
Approved by: https://github.com/angelayi
2025-11-10 19:51:38 +00:00
9491830c79 move subgraph_has_impure_ops from node.is_impure into const_fold to unblock production (#167443)
Summary:
https://github.com/pytorch/pytorch/pull/166609 updates `node.is_impure` to consider a submodule as impure if submodule contains impure node. This in turn changes `graph.eliminate_dead_code()` function behavior, which does not eliminate nodes with side effects, see [pytorch documentation](https://docs.pytorch.org/docs/stable/fx.html#torch.fx.Graph.eliminate_dead_code)
> Remove all dead code from the graph, based on each node’s number of users, and whether the nodes have any side effects.

While it is correct that a submodule containing side-effectful ops is itself side-effectful and should not be dead-code eliminated, some customers rely on dead code elimination to eliminate submodules that contain impure ops, which was the behavior before the #166609 fix.

Due to production environment constraints, we have to revert https://github.com/pytorch/pytorch/pull/166609 and move the side-effectful submodule check logic to `const_fold.py`, which will correctly **not** const-fold a submodule that contains impure ops.

NOTE other call sites that use `node.is_impure()` to make decisions are still incorrectly eliminating side-effectful submodules, but we can't safely change that today.

## This pr
- move `_subgraph_has_impure_op` into `fx/experimental/const_fold.py`, check and prevent const-folding an impure submodule
- added a note in `node.is_impure` to highlight the incorrect behavior and context in case people go looking in the future.

Test Plan: run test_fx_const_fold and all tests pass

Differential Revision: D86641994

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167443
Approved by: https://github.com/jfix71
2025-11-10 19:29:54 +00:00
04a85b4c21 [compile-on-one-rank] Step 1: DeviceId (#166680)
Add a "--virtual-local-rank" mode to torchrun. When used instead of passing the
local rank in LOCAL_RANK it uses a LOCAL_RANK of "0" and adjusts
CUDA_VISIBLE_DEVICES to reflect the desired GPU index.
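
A rough Python sketch of the idea only (helper name is assumed; this is not torchrun's actual implementation):

```
import os

def virtual_local_rank_env(local_rank: int) -> dict:
    # Every worker sees LOCAL_RANK=0; CUDA_VISIBLE_DEVICES narrows visibility to the
    # intended GPU, so device indices baked into compiled graphs match across ranks.
    env = dict(os.environ)
    env["LOCAL_RANK"] = "0"
    env["CUDA_VISIBLE_DEVICES"] = str(local_rank)
    return env

worker_env = virtual_local_rank_env(local_rank=3)
```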

Testing:
(tweaked run_train.sh to use `--log-dir`)
```
export NGPU=8
export CONFIG_FILE="./torchtitan/models/llama3/train_configs/debug_model.toml"
with-proxy ./run_train.sh --model.name compiler_toolkit.llama3 --compile.enable --parallelism.data_parallel_shard_degree=2 --parallelism.tensor_parallel_degree=4
```

And then comparing ranks:

Without --virtual-local-rank there are a lot of differences like:
```
 [rank#]:        mul_1: "f32[8, 512, 256]" = torch.ops.aten.mul.Tensor(mul, view_9);  mul = None
-[rank#]:        _to_copy_3: "bf16[8, 512, 256]" = torch.ops.aten._to_copy.default(mul_1, dtype = torch.bfloat16, layout = torch.strided, device = device(type='cuda', index=0));  mul_1 = None
+[rank#]:        _to_copy_3: "bf16[8, 512, 256]" = torch.ops.aten._to_copy.default(mul_1, dtype = torch.bfloat16, layout = torch.strided, device = device(type='cuda', index=1));  mul_1 = None
 [rank#]:        detach: "f32[8, 512, 1]" = torch.ops.aten.detach.default(rsqrt);  rsqrt = None
```

With --virtual-local-rank, those differences go away.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166680
Approved by: https://github.com/ezyang
2025-11-10 18:47:31 +00:00
a4437d76f0 Add some labeler rules that used to be in the autolabel bot (#167330)
See https://github.com/pytorch/test-infra/pull/7446 for the paths

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167330
Approved by: https://github.com/huydhn
2025-11-10 18:38:42 +00:00
3ea829a337 Fix torch.cond HOP device in inductor (#167354)
Fixes #166918

The output may not be on the same device as the predicate.

```
python test/inductor/test_control_flow.py -k test_output_on_different_device
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167354
Approved by: https://github.com/ydwu4, https://github.com/zou3519
2025-11-10 18:19:38 +00:00
3966b5ad05 [BE] Fix out-of-bounds index_put in test_mps.py (#167444)
Discovered while enabling assertions on out-of-bounds accesses. Otherwise test fails with
```
ERROR: test_sdpa_mask_fp16_L6_S17_NH23_HS121 (__main__.TestSDPA.test_sdpa_mask_fp16_L6_S17_NH23_HS121)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/Users/malfet/git/pytorch/pytorch/torch/testing/_internal/common_utils.py", line 3334, in wrapper
    method(*args, **kwargs)
    ~~~~~~^^^^^^^^^^^^^^^^^
  File "/Users/malfet/git/pytorch/pytorch/build/../test/test_mps.py", line 9494, in test_sdpa_mask_fp16_L6_S17_NH23_HS121
    self._test_sdpa_mask(torch.float16, 7, 17, 23, 121)
    ~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/malfet/git/pytorch/pytorch/build/../test/test_mps.py", line 9478, in _test_sdpa_mask
    y_ref = F.scaled_dot_product_attention(q.cpu(), k.cpu(), v.cpu(), attn_mask=mask.cpu(), dropout_p=0.0, is_causal=False)
                                           ~~~~~^^
torch.AcceleratorError: index out of range

```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/167444
Approved by: https://github.com/Skylion007, https://github.com/manuelcandales
2025-11-10 18:19:28 +00:00
f6a79b2a4a [inductor] Wrap pallas_call in jax.jit (#167441)
My understanding is this is needed for performance.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167441
Approved by: https://github.com/oulgen
2025-11-10 17:29:56 +00:00
2fcf41dd8e Add the ruff rule and skip everything for now (#167360)
Part of https://github.com/pytorch/pytorch/issues/164878
We can start narrowing the skips and removing them as PRs keep landing.

This PR just sets up the scaffolding; the fix will be in a follow-up.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/167360
Approved by: https://github.com/janeyx99
2025-11-10 17:10:15 +00:00
31ccd8f13e [AOTI] Fix a mixed-device bug for scatter_add (#167341)
Summary: Fix https://github.com/pytorch/pytorch/issues/166841. AOTI incorrectly generates a call to aoti_torch_cuda_scatter_reduce_two_out while the op should actually run on CPU. Fix by using the correct device when calling _generate_scatter_fallback in the wrapper codegen.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167341
Approved by: https://github.com/yushangdi
2025-11-10 16:59:44 +00:00
59307ca1bc [BE] adding documentation (#167334)
`torch.ao.quantization` and `torch.fx.experimental`

<img width="833" height="518" alt="Screenshot 2025-11-07 at 3 20 54 PM" src="https://github.com/user-attachments/assets/47b72f28-29bd-4bab-b41f-24d97419e411" />
<img width="892" height="560" alt="Screenshot 2025-11-07 at 3 20 45 PM" src="https://github.com/user-attachments/assets/129825ab-6706-41f2-964d-8774debab18c" />

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167334
Approved by: https://github.com/janeyx99
2025-11-10 14:46:42 +00:00
c28475db7c Update slow tests (#166844)
This PR is auto-generated weekly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/weekly.yml).
Update the list of slow tests.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166844
Approved by: https://github.com/pytorchbot
2025-11-10 12:39:27 +00:00
74aec83841 [xla hash update] update the pinned xla hash (#167452)
This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml).
Update the pinned xla hash.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/167452
Approved by: https://github.com/pytorchbot
2025-11-10 12:03:01 +00:00
52e744d68a [DTensor] Support convert StridedShard to shard order and vice versa (#166740)
We plan to use `StridedShard` to express `shard_order`. This PR adds the function to support the conversion between `StridedShard` and `shard_order`.

I moved some test-related functions into torch/testing/_internal/common_utils.py. For the review, we may only care about **_dtensor_spec.py** and **test_utils.py** in this PR.

### How to convert shard order to StridedShard:
Considering the example:
- placements = $[x_0, x_1, x_2, x_3, x_4]$, all $x_?$ are shard on the same tensor dim.

Let's see how the shard order impacts the split_factor (sf). We loop from right to left over the placements to construct the split_factor, considering the possible shard orders. Starting from $x_4$: this should be a normal shard.

Then $x_3$. There are two possibilities: $x_3$'s order can be before $x_4$'s, in which case $x_3$'s sf=1, because $x_3$ is before $x_4$ in the placements. Otherwise $x_3$'s order is after $x_4$'s, and $x_3$'s sf should be the mesh dim size of $x_4$, which is $T(x_4)$ (see the diagram attached in the PR).

We can use this method to decide on the split factor for $x_2$, $x_1$ and so on.

### How to convert StridedShard to shard order:
This follows the same method as above. We check all possible paths and use the real split_factor to see which path matches it. If there is no match, the StridedShard cannot be converted to a shard order.

---

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166740
Approved by: https://github.com/ezyang
2025-11-10 09:35:10 +00:00
3cfbf98ea9 [xpu][feature] Add XPU support on torch.accelerator.get_memory_info (#162564)
# Motivation
Support XPU for `torch.accelerator.get_memory_info`.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162564
Approved by: https://github.com/albanD
ghstack dependencies: #156812
2025-11-10 05:34:49 +00:00
47db55258b [MPS] sparse sparse mm (#167013)
Sparse sparse mm op implementation

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167013
Approved by: https://github.com/malfet
2025-11-10 05:27:49 +00:00
50af6f3393 [MPS] erfinv for sparse mps (#166711)
Should be merged after #166708
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166711
Approved by: https://github.com/Skylion007, https://github.com/malfet
2025-11-10 05:25:31 +00:00
e545ba2d34 [DTensor] Fix Conv behavior for replicate strategy (#167402)
Pass `dim_map` to `_requires_data_exchange` and return False if both the spatial and channel dimensions are replicated.

Modify `test_conv1d` and `test_conv3d` to check values rather than just shapes, and replicate `conv3d` across the batch dimension.

In general, it feels like the current Convolution implementation was written to work only if the tensor is sharded across the last dimension.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167402
Approved by: https://github.com/ezyang
2025-11-10 05:13:42 +00:00
a058bbdd6f [xpu][test] Enable profiler test for XPU (#165423)
Fixes #165130

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165423
Approved by: https://github.com/EikanWang, https://github.com/atalman, https://github.com/mlazos
2025-11-10 04:02:59 +00:00
2c78080ec0 Register functorch XPU/HPU dispatch keys (#167095)
Fixes TestOperatorsXPU.test_data_write_errors_under_transform_xpu https://github.com/intel/torch-xpu-ops/issues/2237

Tests on other devices throw the runtime error "_mutating directly with `.data` inside functorch transform is not allowed._", but XPU/HPU fail earlier, in `_has_compatible_shallow_copy_type`. This check fails only when calling tensor.data inside a functorch transform.

```cpp
bool _has_compatible_shallow_copy_type(const Tensor& self, const Tensor& from) {
  return self.unsafeGetTensorImpl()->has_compatible_shallow_copy_type(
      from.key_set());
}
```

### t.data
| Tensor | Device | Dispatch Keys |
|--------|---------|---------------|
| `self` | `xpu` | `XPU, ADInplaceOrView, AutogradXPU, AutocastXPU` |
| `from` | `cpu` | `CPU, ADInplaceOrView, AutogradCPU, AutocastCPU` |

### t.data inside functorch transform
| Tensor | Device | Dispatch Keys |
|--------|---------|---------------|
| `self` | `xpu` | `ADInplaceOrView, AutogradOther, FuncTorchGradWrapper` |
| `from` | `cpu` | `CPU, ADInplaceOrView, AutogradCPU, AutocastCPU, FuncTorchGradWrapper` |

### t.data inside functorch transform + XPU dispatch key
| Tensor | Device | Dispatch Keys |
|--------|---------|---------------|
| `self` | `xpu` | `XPU, ADInplaceOrView, AutogradXPU, AutocastXPU, FuncTorchGradWrapper` |
| `from` | `cpu` | `CPU, ADInplaceOrView, AutogradCPU, AutocastCPU, FuncTorchGradWrapper` |
Pull Request resolved: https://github.com/pytorch/pytorch/pull/167095
Approved by: https://github.com/guangyey, https://github.com/albanD
2025-11-10 03:10:22 +00:00
fe6615e397 Swap pallas test shard to 12.8 (#167428)
Getting some weird failures building CUDA 13; let's stick to what we know works.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/167428
Approved by: https://github.com/jansel
2025-11-10 02:42:35 +00:00
abf31db2cc Introduce a new API torch.accelerator.get_memory_info (#156812)
# Motivation
`torch.cuda.mem_get_info` and `torch.xpu.mem_get_info` are widely used in other popular repos, such as
- 076313bd09/python/sglang/srt/utils.py (L378),
- 7ecc2d7f39/src/accelerate/utils/modeling.py (L822),
- 7ba34b1241/vllm/worker/worker.py (L150).
This PR introduces a unified API `torch.accelerator.get_memory_info` to cover this scenario.
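
For reference, a usage sketch of the existing CUDA-specific call that motivates this (the exact signature of the new unified API is not shown here):

```
import torch

if torch.cuda.is_available():
    free_bytes, total_bytes = torch.cuda.mem_get_info()  # per-backend API used by the repos above
    print(f"free {free_bytes / 2**30:.2f} GiB of {total_bytes / 2**30:.2f} GiB")
```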

Pull Request resolved: https://github.com/pytorch/pytorch/pull/156812
Approved by: https://github.com/albanD
2025-11-10 01:57:39 +00:00
a4c7856112 [Inductor][Grouped Gemm] Add Blackwell CuTeDSL Kernel (#167340)
Summary: This is a reland of https://github.com/pytorch/pytorch/pull/165036, which previously contained a minor bug in the logic that determined whether the kernel should be enabled. As a result, it was incorrectly activated on non-Blackwell GPUs.

Test Plan:
Inductor test (fbcode):
`INDUCTOR_TEST_DISABLE_FRESH_CACHE=1 TORCHINDUCTOR_CACHE_DIR=~/cutetest buck2 run mode/opt //caffe2/test/inductor:cutedsl_grouped_mm -c fbcode.nvcc_arch=b200a -c fbcode.enable_gpu_sections=true -c fbcode.platform010_cuda_version=12.8 -m "ovr_config//third-party/pypi/nvidia-cutlass-dsl/constraints:4.2.1"`

Tritonbench (fbcode):
`clear; CUDA_VISIBLE_DEVICES=7 TRITON_PRINT_AUTOTUNING=1 TRITON_ALWAYS_COMPILE=1 TORCH_LOGS=+inductor TORCHINDUCTOR_FORCE_DISABLE_CACHES=1 TORCHINDUCTOR_MAX_AUTOTUNE_GEMM=1 buck2 run mode/opt //pytorch/tritonbench:run -c fbcode.nvcc_arch=b200a -c fbcode.enable_gpu_sections=true -c fbcode.platform010_cuda_version=12.8 -m "ovr_config//third-party/pypi/nvidia-cutlass-dsl/constraints:4.2.1" -- --op grouped_gemm --only aten_grouped_mm,preprocessed_pt2_cute_grouped_mm --precision bf16  --num-inputs 1 --metrics tflops,accuracy`

Tritonbench(oss):
`clear; CUDA_VISIBLE_DEVICES=2 TRITON_PRINT_AUTOTUNING=1 TRITON_ALWAYS_COMPILE=1 TORCH_LOGS=+inductor TORCHINDUCTOR_FORCE_DISABLE_CACHES=1 TORCHINDUCTOR_MAX_AUTOTUNE_GEMM=1 python run.py --op grouped_gemm --only aten_grouped_mm,preprocessed_pt2_triton_grouped_mm --precision bf16  --num-inputs 1 --metrics tflops,accuracy`

Unit Tests(oss):
`clear; python test/inductor/test_cutedsl_grouped_mm.py`

Differential Revision: D86537373

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167340
Approved by: https://github.com/jananisriram
2025-11-10 00:29:07 +00:00
afb014541b Separately handle null data_ptr storages when creating unique ID (#167405)
## Summary
Previously fake/functionalized tensors that have `null` storage_ptr could segfault when checking for `.expired()` on weak storage ref, so handle `nullptr` storages separately, without checking their weakrefs.

Diagnosis and PR created by codex
------
[Codex Task](https://chatgpt.com/codex/tasks/task_e_690ea8790054832f90eaffb37ee0d8c8)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/167405
Approved by: https://github.com/Skylion007
2025-11-09 23:13:56 +00:00
b91a2ab892 [2/N] Use context managers (#167404)
This PR fixes more context manager usage in Python code.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167404
Approved by: https://github.com/mlazos
2025-11-09 13:38:14 +00:00
14a845a4ec [2/N] Use Python 3.10 typing (#167167)
This PR applies new `Union` and `Optional` typing syntax to some files.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167167
Approved by: https://github.com/XuehaiPan, https://github.com/mlazos
2025-11-09 12:11:45 +00:00
5135ace3a3 Enable ruff UP035 rule (#167307)
This PR enables `UP035` rule of ruff.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167307
Approved by: https://github.com/Lucaskabela
2025-11-09 06:40:03 +00:00
e7c1905837 Fix test_fsdp_logging (#167312)
- The logger name in test_fully_shard_logging.py was wrong so the logs didn't happen.
- The `device` variable in test_fully_shard_logging is expected to be a string, so quote it
- `unittest.skipIf` is used so importing `unittest` instead of `unittest.mock` is required

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167312
Approved by: https://github.com/Skylion007, https://github.com/cyyever
2025-11-09 05:38:11 +00:00
9cf623a209 Update inductor-unittest.yml (#167417)
I see failures like https://github.com/pytorch/pytorch/actions/runs/19189378182/job/54865171317?pr=167389

Maybe this will fix it.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/167417
Approved by: https://github.com/yf225
2025-11-09 05:08:00 +00:00
06aa3ef3d3 Move types from typing_extensions to typing (#167185)
This PR moves some types from `typing_extensions` to `typing`, now that they are available there following the recent update to Python 3.10.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167185
Approved by: https://github.com/janeyx99
2025-11-09 02:50:18 +00:00
0384104e23 Update pythoncapi_compat.h to 11cb80f2652cb2fe5231bf60b9dd98c83a4e25f4 (#167413)
Second attempt for https://github.com/pytorch/pytorch/pull/167138 with fixes for name conflicts in downstream packages.

Should slightly simplify https://github.com/pytorch/pytorch/pull/166342
Pull Request resolved: https://github.com/pytorch/pytorch/pull/167413
Approved by: https://github.com/Skylion007
2025-11-09 02:14:33 +00:00
325ec98009 [13/N] Apply ruff UP035 rule (#167048)
This PR continues to apply ruff UP035 rule to test code and some remaining torch files.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167048
Approved by: https://github.com/Skylion007
2025-11-09 01:47:38 +00:00
47acdea74a another version of fixing CachingHostAllocatorImpl destructor (#167408)
Another version of #167347 that won't break xpu and should correctly handle runtime changes of `pinned_use_background_threads()`
Pull Request resolved: https://github.com/pytorch/pytorch/pull/167408
Approved by: https://github.com/yingufan, https://github.com/Skylion007
2025-11-09 00:20:54 +00:00
71606b289c [BugFix] Fix compute_error in coo_mean_time and csr_mean_time (#166795)
The csr timing loop was nested inside the coo loop, producing duplicated and inconsistent measurements.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166795
Approved by: https://github.com/cyyever, https://github.com/ezyang
2025-11-08 23:57:15 +00:00
e342a7509a [pallas backend] add cpu backend and parametrize the tests (#167388)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/167388
Approved by: https://github.com/jansel
2025-11-08 23:11:35 +00:00
27ac58bd70 Optimize global save-plan validation (#166820)
## Summary
- Fixes #163548 by replacing the quadratic chunk-overlap scan in `_validate_global_plan` with a sweep-line pass that sorts chunk intervals and keeps an active set via `bisect_right`, giving O(n log n) behavior for metadata validation (see the sketch below).
- Add focused tests in `TestValidateGlobalPlan` covering overlapping and non-overlapping shard layouts to lock in the faster path.
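
A 1-D sketch of the sweep-line idea, assuming a `(start, length)` chunk representation (the actual DCP code works over shard metadata and keeps an active set via `bisect_right`):

```
def has_overlap(chunks):
    """Return True if any two half-open [start, start+length) ranges overlap."""
    intervals = sorted((start, start + length) for start, length in chunks)
    max_end = -1
    for start, end in intervals:          # sweep over chunks ordered by start offset
        if start < max_end:
            return True                   # this chunk begins inside an earlier one
        max_end = max(max_end, end)
    return False

assert has_overlap([(0, 8), (4, 8)])
assert not has_overlap([(0, 4), (4, 4), (8, 4)])
```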

## Testing
- python test/distributed/checkpoint/test_planner.py -k ValidateGlobalPlan

## Benchmarks
| chunks | old runtime | new runtime |
|--------|-------------|-------------|
| 1 024  | 0.121 s     | 0.0014 s    |
| 2 048  | 0.486 s     | 0.0027 s    |
| 4 096  | 2.474 s     | 0.0058 s    |
| 8 192  | 8.014 s     | 0.0126 s    |
| 16 384 | 32.740 s    | 0.026 s     |

@ezyang

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166820
Approved by: https://github.com/LucasLLC, https://github.com/Skylion007
2025-11-08 20:59:44 +00:00
406719c3da [MPS] SparseMps mv op (#166708)
Should be merged after #166561
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166708
Approved by: https://github.com/Skylion007
2025-11-08 20:03:49 +00:00
957570e4a3 [dynamo][guards] 1/N Guard selectively for DTensor (#165824)
A few internal jobs are observing very high guard overhead for DTensor.
Since we own DTensor, we can make those guards way faster.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165824
Approved by: https://github.com/Lucaskabela, https://github.com/bdhirsh
2025-11-08 19:28:28 +00:00
eeb6c96a89 [vision hash update] update the pinned vision hash (#167391)
This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml).
Update the pinned vision hash.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/167391
Approved by: https://github.com/pytorchbot
2025-11-08 05:58:11 +00:00
0b12e49795 [Inductor] Decouple flags for optimization and debug symbols (#167385)
Summary:
What: Decouple flags for optimization and debug symbols

Why: The current flag for debug symbols only compiles the .so binary in unoptimized mode

Differential Revision: D86363355

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167385
Approved by: https://github.com/hl475, https://github.com/jansel
2025-11-08 05:13:38 +00:00
87646e5db4 [dynamo][ac] Return all intermediates as outputs for AC Hop (#167192)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/167192
Approved by: https://github.com/zou3519
2025-11-08 03:56:39 +00:00
29d6bb79e1 Use context managers (SIM115) (#166928)
This PR changes code to use context managers if possible.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166928
Approved by: https://github.com/Lucaskabela
2025-11-08 03:09:16 +00:00
c2924bbafa [dynamo] replace raise Unsupported(...) with unimplemented(...) (#167255)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/167255
Approved by: https://github.com/Lucaskabela, https://github.com/mlazos, https://github.com/zou3519
ghstack dependencies: #167150
2025-11-08 02:01:12 +00:00
a2f109dcc3 [dynamo] rename unimplemented_v2 -> unimplemented (#167150)
Also force the new `unimplemented`/old `unimplemented_v2` to explicitly specify the `gb_type`, `context`, `explanation`, and `hints` args.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167150
Approved by: https://github.com/mlazos, https://github.com/zou3519
2025-11-08 01:49:53 +00:00
ba5ffa2dca [5/N] Use key in dict for existence checks (#167311)
This PR uses `key in dict` expressions for existence checks of dict elements in Python code. This operation is more efficient than `key in dict.keys()`.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167311
Approved by: https://github.com/janeyx99, https://github.com/Lucaskabela
2025-11-08 01:43:12 +00:00
c131e4b390 Revert "[CP] Correctly compile create_cp_block_mask (#167153)"
This reverts commit 5a9ae7cefe679ff925a0aa7b9f5782fc93d4ef29.

Reverted https://github.com/pytorch/pytorch/pull/167153 on behalf of https://github.com/donigian due to breaking internal tests D86529123 ([comment](https://github.com/pytorch/pytorch/pull/167153#issuecomment-3505563239))
2025-11-08 01:33:13 +00:00
7fd15aa2bd Additional fix on top of D85172267 (#167267) (#167279)
Summary:

It seems D80948073 has caused some issues in a lowering pkg built on trunk: https://fburl.com/mlhub/o6p60pno
error log: P2001933683
We were able to lower this successfully in an older ien pkg: https://fburl.com/mlhub/1ro094zo

D85172267 fixed this issue for the if branch, but the issue still exists for the else branch. The logic is moved right before the if-else to cover both cases.

Test Plan:
checkout D85605372

buck2 run -c fbcode.enable_gpu_sections=true -c fbcode.platform=platform010 -c fbcode.nvcc_arch=a100,h100 -c fbcode.split-dwarf=true -c fbcode.dwp=true -c fbcode.enable_distributed_thinlto=true -c fbcode.use_link_groups=true fbcode//inference_enablement/model_processing/infra/components/lowering/re:re_cinder -- -r "$(cat ./fbcode/minimal_viable_ai/umia_v1/ig/ss_omni_exp/re_lower_aoti.json)"

with the diff, no issue was encountered.

Reviewed By: tissue3

Differential Revision: D86474796

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167279
Approved by: https://github.com/pianpwk
2025-11-08 01:28:49 +00:00
c45c966031 subproc_pool: Fix quiesce waitcounter (#167350)
Summary:
I was inspecting running jobs, and the quiesce waitcounter wasn't showing up.
It turns out this was a bad copy-paste.

Test Plan: Primarily inspection

Reviewed By: masnesral

Differential Revision: D86457409

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167350
Approved by: https://github.com/aorenste, https://github.com/masnesral
2025-11-08 01:12:18 +00:00
d18c742779 [HOP][print]Add make_fx for the proxy with graph module print (#166920)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166920
Approved by: https://github.com/angelayi
ghstack dependencies: #166660
2025-11-08 00:34:24 +00:00
4957ae5838 Add API to annotate disjoint backward and handle in AC (#166536)
This adds zero-bubble / DualPipeV support for (S)AC

Before:
- AC will always retrigger recompute upon every distinct backward.

After:
- Any checkpointed regions encountered by backward under the same instance of this context manager will trigger recompute at most once, even if there are multiple calls to backward.
- Backward calls under the same instance of this context manager must execute over non-overlapping regions of the backward graph even if retain_graph=True.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166536
Approved by: https://github.com/albanD
2025-11-08 00:21:25 +00:00
31d6d3ef5c [easy] Add new torch/csrc/stable/c/shim.h to existing nitpick (#167367)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/167367
Approved by: https://github.com/janeyx99, https://github.com/malfet
2025-11-08 00:13:03 +00:00
2325c511e7 [dynamo] Make sym node vt creation via SymNodeVariable create (#167189)
This will help in the next PRs.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167189
Approved by: https://github.com/williamwen42, https://github.com/zou3519
ghstack dependencies: #167160
2025-11-07 23:58:13 +00:00
d865156967 [dynamo][hops] Overwrite proxy of the original VT to the subgraph outputs (#167160)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/167160
Approved by: https://github.com/zou3519
2025-11-07 23:58:13 +00:00
fbc0bd2e90 [DTensor][be] getting rid of unneccesary Partial check for norm functions (#167247)
**Summary:** While the implementation is correct, these checks are just a subset of the Partial placement checks that are done in https://github.com/pytorch/pytorch/pull/165962. This means for ops aten.linalg_vector_norm.default and aten._foreach_norm.Scalar, we're unnecessarily checking for Partial placements twice.

**Test Cases**
1. pytest test/distributed/tensor/test_math_ops.py -k test_vector_norm_partial
2. pytest test/distributed/tensor/test_math_ops.py -k test_foreach_norm_partial
3. pytest test/distributed/tensor/test_math_ops.py -k test_partial_reduction_ops

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167247
Approved by: https://github.com/XilunWu
2025-11-07 23:49:29 +00:00
70f5f55abf [Inductor-FX] Allocate tensors on device type instead of indexed device (#167358)
# Problem
The FX backend currently allocates tensors on an exact device index, such as `"cuda:0"`. In contrast, the Python backend allocates on a device type, such as `"cuda"`. This avoids edge cases where fake tensor propagation can fail due to mismatched devices.

# Fix
Allocate tensors on `device.type` instead of the device.
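
A tiny illustration of the difference (a sketch, assuming a CUDA build):

```
import torch

dev = torch.device("cuda:0")
print(dev.type)                      # "cuda" -- the index is dropped
t = torch.empty(4, device=dev.type)  # allocate on the device type, not on "cuda:0"
```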

# Test plan
Added a CI test passing in sample inputs on an indexed device, and checking that the output device in the generated FX graph is not indexed.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167358
Approved by: https://github.com/mlazos, https://github.com/nandesuka, https://github.com/eellison
2025-11-07 23:48:54 +00:00
69ecb562e7 [PT2 Compiler] Add annotation for dynamo disabled callables (#166341)
Summary: To make torch.export compatible with PT2 compile (which is done on top of the exported model), we need to store torch._dynamo.disable attributes in the exported model and later restore them after unflattening the exported model. This diff adds annotations to all nodes with torch._dynamo.disable, which will be preserved during export.

Test Plan:
```
buck test mode/opt caffe2/test:test_export -- 'test_dynamo_disable_annotations'
```
https://www.internalfb.com/intern/testinfra/testrun/6473924770741560

Differential Revision: D85302730

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166341
Approved by: https://github.com/williamwen42, https://github.com/angelayi
2025-11-07 23:28:00 +00:00
5062abe4e7 [CI][serialization] Fix exception regexes with Python-3.14 (#167333)
Not sure why, but running some tests (for example `test_weights_only_safe_globals_build`) with `pytest` in 3.14 makes the global name `test_serialization.ClassThatUsesBuildInstruction` instead of the expected `__main__.ClassThatUsesBuildInstruction`.
Also, change the expected exception type from `AttributeError` to `PicklingError`.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/167333
Approved by: https://github.com/atalman
2025-11-07 23:22:36 +00:00
c7007e7584 Update Kineto Submodule (#167343)
Summary: Title

Test Plan: CI

Differential Revision: D86538778

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167343
Approved by: https://github.com/Skylion007, https://github.com/aaronenyeshi
2025-11-07 23:06:58 +00:00
09705ca9b2 [dynamo][guards] Fix mem leak in tensor subclass metadata guard (#167352)
Use the cls instead of the object. Earlier, the metadata guard was holding on
to the DTensor, causing a memory leak.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167352
Approved by: https://github.com/Skylion007
2025-11-07 23:01:15 +00:00
ea6b0b5d0f add missing cpp standard lib in HeaderOnlyArrayRef.h (#167337)
Fixes #167315
Pull Request resolved: https://github.com/pytorch/pytorch/pull/167337
Approved by: https://github.com/janeyx99
2025-11-07 23:00:08 +00:00
bbf852d87f Revert "Remove python workaround for ContextDecorator (#167049)"
This reverts commit 13d2cc7bd26e32cafff0377dda1c5ddc8d04c4ce.

Reverted https://github.com/pytorch/pytorch/pull/167049 on behalf of https://github.com/donigian due to breaking internal tests D86342845 ([comment](https://github.com/pytorch/pytorch/pull/167049#issuecomment-3505251296))
2025-11-07 22:32:45 +00:00
6392b986e7 Revert "[13/N] Apply ruff UP035 rule (#167048)"
This reverts commit ea44f12bce3eb05eaa9fa34943a3ffae04647fa5.

Reverted https://github.com/pytorch/pytorch/pull/167048 on behalf of https://github.com/donigian due to breaking internal tests D86342860 ([comment](https://github.com/pytorch/pytorch/pull/167048#issuecomment-3505232522))
2025-11-07 22:25:01 +00:00
32d30d96cf [ROCm][CI] unconditionally add gfx950, gfx115x to PYTORCH_ROCM_ARCH (#167299)
Included gfx950, gfx1150, and gfx1151 unconditionally in PYTORCH_ROCM_ARCH. Removed the ROCm 7.0 version check and refactored the architecture list.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167299
Approved by: https://github.com/jeffdaily
2025-11-07 21:47:59 +00:00
46516efa85 [BE] use undeprecated from/to in libtorch_agnostic tests (#167126)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/167126
Approved by: https://github.com/Skylion007
ghstack dependencies: #164991, #165152, #165153, #165953
2025-11-07 21:31:30 +00:00
84b2147b85 Introducing the StableIValue representation of list :D (#165953)
Some important notes:
a) Just as IValues steal ownership of ArrayRefs and any std::vectors in order to convert the inner elements into IValues, we do the same thing with StableIValue. This O(N) traversal is ineluctable.
b) As a result, since StableIValues are owning and our contract is that to<T>(StableIValue) transfers ownership, you can never convert from a StableIValue to a non-owning HeaderOnlyArrayRef<V>.

We handle memory similar to AtenTensorHandle, but we have a StableListHandle!

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165953
Approved by: https://github.com/malfet
ghstack dependencies: #164991, #165152, #165153
2025-11-07 21:31:30 +00:00
1727a71cb6 Create pallas test shard (#167143)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/167143
Approved by: https://github.com/malfet
ghstack dependencies: #167243
2025-11-07 21:05:54 +00:00
fb9e10fe25 Revert "Update pythoncapi_compat.h (#167138)"
This reverts commit c90a976370945af052bb7b0db86240fa6f321cd6.

Reverted https://github.com/pytorch/pytorch/pull/167138 on behalf of https://github.com/donigian due to Sorry but this is breaking internally. See diff D86458778 for details. ([comment](https://github.com/pytorch/pytorch/pull/167138#issuecomment-3504895388))
2025-11-07 20:53:14 +00:00
4e277e6323 inductor: compile_worker - Fix potential race condition with quiesce waitcounters (#167025)
Summary:
If quiesce ends up being called twice (which is likely not possible with the timer-based implementation, but is possible with either manual calls or with the context-manager implementation), this assertion fires.

Instead, make this assertion tolerant to reentrant calling of quiesce.

Test Plan: Added an explicit test which calls quiesce twice.

Differential Revision: D86251534

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167025
Approved by: https://github.com/masnesral
2025-11-07 20:49:34 +00:00
ba327b7a5c [BE][Typing][Dynamo] Type torch/_dynamo/variables/functions.py (#167103)
Provides type coverage to torch/_dynamo/variables/functions.py

Coverage report:
`mypy torch/_dynamo/variables/functions.py --linecount-report /tmp/coverage_log`

Compare before to after - we go from 0 lines and 0 funcs covered to 2698 lines and 166 funcs covered

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167103
Approved by: https://github.com/mlazos, https://github.com/fxdawnn
2025-11-07 20:38:08 +00:00
8eb21304ab [DTensor] ignore fresh unbacked symbols in shard prop (#166989)
This fixes 2 issues with the DTensor data-dependent test case:

1) ShapeEnv not found when doing shard prop on data-dependent ops - fix was to detect the outer tracing fake mode. Maybe ShardingPropagator should just own a FakeMode & ShapeEnv for these purposes? The previous behavior was to initialize a new fake mode on every call.

2) Pending unbacked symbols not found. This happens because DTensor dispatch runs fake prop twice, once while figuring out the output sharding: 2bba37309b/torch/distributed/tensor/_sharding_prop.py (L175) and again to actually get the resulting local tensor: 2bba37309b/torch/distributed/tensor/_dispatch.py (L254-L255) With data-dependent ops, both calls will produce an unbacked symbol, but symbols in the first invocation are never surfaced, producing this error, so we ignore pending symbols from this site.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166989
Approved by: https://github.com/ezyang
2025-11-07 20:18:41 +00:00
b83a3f6e87 compile time comm benchmarking (#167100)
Adds an option to do compile-time collective benchmarking for comms/compute overlap scheduling. As with the comm benchmarks, these are all-gathered, and each rank uses the median result to ensure consistency. Thanks to @ruisizhang123, who had done this previously.

We log the compile time benchmark, the inductor analytic result, and the nccl estimator result to tlparse.

TODO:
- mechanism to seed collective estimates with the existing tlparse (or perfetto) to use for deterministic, pgo'd estimates
- interpolate results between powers of 2, and also do the actual benchmarking for latency calculation. both of these need to be meta aware since reduce scatter needs to be divisible by group_size, not hard but leaving for a subsequent pr.

Example output tlparse: https://manifold.edge.x2p.facebook.net/v0/read/tree/logs/eellison/custom/rank_0/-_0_0_0/node_runtime_estimation_10.json?bucketName=tlparse_reports&apiKey=tlparse_reports-key&withPayload=1&timeoutMsec=10000

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167100
Approved by: https://github.com/IvanKobzarev
2025-11-07 20:13:37 +00:00
289b47e657 [MPS] empty matrix x vec mul fix (#166561)
Fixes empty matrix x vector multiplication. Discovered while implementing an op for sparse tensors.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166561
Approved by: https://github.com/eqy, https://github.com/albanD
2025-11-07 20:05:46 +00:00
c20308b79e [Test CI] Bump ruff to 0.14.4 (#167286)
This PR bumps ruff to 0.14.4.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/167286
Approved by: https://github.com/janeyx99, https://github.com/Skylion007
2025-11-07 20:05:10 +00:00
4c41e9bde7 making TORCH_CHECK_{COND} non-fatal (#167004)
TORCH_CHECK is non-fatal by design, but the TORCH_CHECK_{COND} macros are fatal. This is confusing, and we should limit fatality to the set of debug macros.

Differential Revision: D86168955

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167004
Approved by: https://github.com/malfet
2025-11-07 19:48:19 +00:00
2f5223564e [ez] Remove experiment for uploading all test runs (#167133)
Reverts #165484.

After #166988 they are just uploaded while the job is running.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/167133
Approved by: https://github.com/malfet
2025-11-07 19:38:36 +00:00
28615a765d Fix: list index out of range with softmax when using 0 dim (#166547)
Fixes #163971

Problem:
PyTorch's inductor compiler crashed with IndexError: list index out of range when compiling code that uses  0-dimensional tensors with operations like torch.softmax(scalar_tensor, dim=0).

A 0-dim tensor has shape = torch.Size([]) (empty shape)

```
# ndim = 0 (zero dimensions)
# len(shape) = 0 (no indices to access)

# Line 972: Pad other_shape to match inp dimensions
other_shape = [1] * (inp_ndim - len(other_shape)) + list(other_shape)

# For scalar tensors:
# inp_ndim = 0  # as input is scalar
# other_shape = []
# Result: [1] * (0 - 0) + [] = [] (still empty!)

dim = match.kwargs["dim"]  # dim = 0
if isinstance(dim, int):
    dim = (dim,)

# crash is happening here!
return all(statically_known_true(other_shape[d] == 1) for d in dim)
#                                 ^^^^^^^^^^^^^^^^
#                                 Tries other_shape[0] but other_shape = [] (empty!)
#                                 → IndexError: list index out of range
```

The function _other_is_broadcasted_in_dim() is an optimization check for a softmax fusion pattern. It verifies whether it's safe to rewrite:

```
# From
scaled = inp * other
result = scaled - scaled.amax(dim, keepdim=True)

# To this more stable form:
result = (inp - inp.amax(dim, keepdim=True)) * other
```

The optimization is only valid if other is constant across the reduction dimension (i.e., broadcasted to size 1 in that dimension). Otherwise, scaling changes which element is the maximum.
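
A hedged sketch of the kind of program described above (the exact reproducer is in the linked issue #163971; this is only illustrative of a 0-dim softmax under torch.compile):

```
import torch

@torch.compile
def f(x, scale):
    # 0-dim inputs: shape == torch.Size([]), so the pattern check used to index
    # into an empty padded shape list and raise IndexError.
    return torch.softmax(x * scale, dim=0)

f(torch.tensor(2.0), torch.tensor(3.0))
```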

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166547
Approved by: https://github.com/jansel
2025-11-07 19:32:43 +00:00
d1446ad75c Register floor_divide.out for MTIA (#167280)
Differential Revision: D86468749

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167280
Approved by: https://github.com/albanD
2025-11-07 19:31:51 +00:00
e401a56b96 [ez] Remove some dead code from test artifact related files (#166966)
Remove circle ci path since it's no longer used

Remove function that is not used
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166966
Approved by: https://github.com/malfet

Co-authored-by: Nikita Shulga <2453524+malfet@users.noreply.github.com>
2025-11-07 18:14:44 +00:00
22650c89fb [ROCm] Update skip_if_lt_x_gpu to work with MultiProcContinuous class (#167281)
- Since the MultiProcContinuous class spawns one process per GPU and runs a UT in each of the processes, we need to ensure we propagate the exit code associated with skip all the way to the main worker thread that spawned all the child processes.
- This commit also updates several UTs that are meant for 4 GPUs but incorrectly call skip_if_lt_x_gpu with 2 as an input. Examples:
    - test_replicate_with_fsdp.py
    - test_dtensor_resharding.py
    - test_state_dict.py
    - test_functional_api.py: Fix typo. multi-accelerator doesn't exist; replaced with multi-gpu
    - test_op_strategy.py: world_size was hardcoded
    - test_math_ops.py: UT written for 4 GPUs, so skipping for anything less
    - test_schedule_multiproc.py: All UTs in this suite are required to run on 2+ GPUs; therefore, adding skips if fewer than 4 GPUs are supplied

Fixes https://github.com/pytorch/pytorch/issues/166875

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167281
Approved by: https://github.com/jeffdaily
2025-11-07 18:11:48 +00:00
c62a17a2fb [ez] Remove some unused vars in common_utils.py (#166453)
I can't find where these are used
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166453
Approved by: https://github.com/malfet
2025-11-07 18:09:40 +00:00
713e289ae7 [dynamo][pytree] support more optree functions by polyfill the underlying CXX functions directly (#167292)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/167292
Approved by: https://github.com/Lucaskabela
ghstack dependencies: #167221, #167211
2025-11-07 18:09:19 +00:00
69784a0dbe [dynamo][pytree] add polyfills for optree path APIs (#167211)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/167211
Approved by: https://github.com/Lucaskabela
ghstack dependencies: #167221
2025-11-07 17:53:32 +00:00
3c2409c465 Refactor recursive call of collect_temp_source (#166714)
A recursive function call creates a reference cycle: closure <- function <- cell inside the closure.
Capturing self (the PyCodegen instance) in the same closure prolongs its life until the next gc.collect(), which might result in worse resource management.

After the introduction of e9209e0, OOM issues have been observed. Looking for reference cycles, one has been uncovered that would prolong the lifetime of tensors, and as a result OOM issues might occur. The following dependency chain has been uncovered (diagram attached in the PR):

At the end of it, a reference cycle can be found that consists of the closure for the function collect_temp_source, the function itself, and a cell object inside the closure that points back to the function due to the recursive call (a minimal illustration follows below).

This issue can be resolved either by removing the recursion or by removing the PyCodegen instance from the closure.
Another precaution that can be taken is to explicitly empty the f_locals dict. This way we cut the tensor out of the chain leading to the reference cycle.
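
A minimal illustration of such a cycle (names are assumed; this is not the actual PyCodegen code):

```
import gc

def make_collector(codegen):
    # 'codegen' stands in for the PyCodegen instance captured by the closure.
    def collect_temp_source(source):
        _ = codegen  # free variable: the closure keeps the PyCodegen instance alive
        if source is not None:
            # The recursive call resolves through a closure cell, so the function
            # object references itself: function -> __closure__ -> cell -> function.
            collect_temp_source(getattr(source, "base", None))
    return collect_temp_source

fn = make_collector(object())
assert any(c.cell_contents is fn for c in fn.__closure__)  # the cycle exists
del fn
gc.collect()  # only a GC pass reclaims the cycle (and anything it keeps alive)
```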

Fixes #166721

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166714
Approved by: https://github.com/Lucaskabela, https://github.com/Skylion007, https://github.com/jeromean, https://github.com/williamwen42, https://github.com/mlazos
2025-11-07 17:52:54 +00:00
724cd32b0c [PT2 Compiler] Add flag in dynamo disable wrapper to indicate recursive disable (#165790)
Summary: After torch._dynamo.disable is applied, the wrapped method does not have any flag to indicate whether it was disabled recursively or not. This flag is needed to preserve dynamo-disabled methods in a torch.export-ed model.

Test Plan:
```
buck test mode/opt caffe2/test/dynamo:test_dynamo -- 'test_disable_recursive_flags'
````
https://www.internalfb.com/intern/testinfra/testrun/7599824674075603

Differential Revision: D84949143

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165790
Approved by: https://github.com/angelayi, https://github.com/williamwen42
2025-11-07 17:48:20 +00:00
b62935d1a5 fix alpha beta in decomp (#167317)
fix for https://github.com/pytorch/pytorch/issues/167313

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167317
Approved by: https://github.com/zou3519
ghstack dependencies: #161404
2025-11-07 17:42:13 +00:00
ccc8c117dc Codeowner/Labeler updates post-Blas-reorgs (#167130)
Summary:

Previous PRs have split out scaled/grouped Blas routines into
their own files. This updates the codeowners and labeler to reflect
those changes.

Signed-off-by: Simon Layton <simonlayton@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/167130
Approved by: https://github.com/drisspg
2025-11-07 17:27:41 +00:00
86db4de10f [PP] PP Runtime Features for supporting Graph Based execution (#167277)
Allow overriding UNSHARD, RESHARD and REDUCE_GRAD actions.
Enable running pp backward without torch.grad.is_enabled().

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167277
Approved by: https://github.com/wconstab
2025-11-07 17:11:14 +00:00
12860892f8 Revert "[Inductor][Grouped Gemm] Add Blackwell CuTeDSL Kernel (#167182)"
This reverts commit 77b70970f70d53de71b9703ad4c3199d714c535a.

Reverted https://github.com/pytorch/pytorch/pull/167182 on behalf of https://github.com/NikhilAPatel due to breaks local source build ([comment](https://github.com/pytorch/pytorch/pull/167182#issuecomment-3503598156))
2025-11-07 16:45:23 +00:00
694592ac1e Move enrich_profiler_metadata config import out of gm.recompile() (#167114)
Fixes T243967987

Move `enrich_profiler_metadata` from `torch._dynamo.config` to `torch.fx.experimental._config`.

We cannot import anything inside recompile(); it caused a perf regression internally. We move the config so we can import it at the top of `graph_module.py` without causing any circular import.

We also cannot delete the old config right now because some internal tests rely on copies of the old `graph_module.py` cpp file in unit tests. But I think we should be able to delete the old config soon after this PR lands.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167114
Approved by: https://github.com/angelayi
2025-11-07 16:12:47 +00:00
285748e838 fix the cpp_builder error under riscv (#167071)
**fix the cpp_builder error under riscv**

`g++: error: ‘-march=native’: ISA string must begin with rv32 or rv64`

(EngineCore_DP0 pid=14414) ERROR 11-04 18:36:01 [core.py:779]   File "/usr/local/lib64/python3.11/site-packages/torch/_inductor/cpp_builder.py", line 1718, in build
(EngineCore_DP0 pid=14414) ERROR 11-04 18:36:01 [core.py:779]     run_compile_cmd(build_cmd, cwd=_build_tmp_dir)
(EngineCore_DP0 pid=14414) ERROR 11-04 18:36:01 [core.py:779]   File "/usr/local/lib64/python3.11/site-packages/torch/_inductor/cpp_builder.py", line 401, in run_compile_cmd
(EngineCore_DP0 pid=14414) ERROR 11-04 18:36:01 [core.py:779]     _run_compile_cmd(cmd_line, cwd)
(EngineCore_DP0 pid=14414) ERROR 11-04 18:36:01 [core.py:779]   File "/usr/local/lib64/python3.11/site-packages/torch/_inductor/cpp_builder.py", line 396, in _run_compile_cmd
(EngineCore_DP0 pid=14414) ERROR 11-04 18:36:01 [core.py:779]     raise exc.CppCompileError(cmd, output) from e
(EngineCore_DP0 pid=14414) ERROR 11-04 18:36:01 [core.py:779] torch._inductor.exc.InductorError: CppCompileError: C++ compile error
(EngineCore_DP0 pid=14414) ERROR 11-04 18:36:01 [core.py:779]
(EngineCore_DP0 pid=14414) ERROR 11-04 18:36:01 [core.py:779] Command:
(EngineCore_DP0 pid=14414) ERROR 11-04 18:36:01 [core.py:779] g++ /tmp/tmpv8qz53jp/header.hpp -D TORCH_INDUCTOR_CPP_WRAPPER -D STANDALONE_TORCH_HEADER -D C10_USING_CUSTOM_GENERATED_MACROS -fPIC -O3 -DNDEBUG -fno-trapping-math -funsafe-math-optimizations -ffinite-math-only -fno-signed-zeros -fno-math-errno -fexcess-precision=fast -fno-finite-math-only -fno-unsafe-math-optimizations -ffp-contract=off -fno-tree-loop-vectorize -march=native -Wall -std=c++17 -Wno-unused-variable -Wno-unknown-pragmas -fopenmp -I/usr/include/python3.11 -I/usr/local/lib64/python3.11/site-packages/torch/include -I/usr/local/lib64/python3.11/site-packages/torch/include/torch/csrc/api/include -D_GLIBCXX_USE_CXX11_ABI=1 -E -P -o /tmp/tmpv8qz53jp/header.i
(EngineCore_DP0 pid=14414) ERROR 11-04 18:36:01 [core.py:779]
(EngineCore_DP0 pid=14414) ERROR 11-04 18:36:01 [core.py:779] Output:
(EngineCore_DP0 pid=14414) ERROR 11-04 18:36:01 [core.py:779] g++: error: ‘-march=native’: ISA string must begin with rv32 or rv64

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167071
Approved by: https://github.com/malfet
2025-11-07 16:01:30 +00:00
192034c41b [easy][dynamo][pytree] simplify pytree polyfill module by move out the guard-if (#167221)
Move the guard-if in `polyfills.pytree` to `polyfills.loader` and dedent the code in the if-branch.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167221
Approved by: https://github.com/Lucaskabela
2025-11-07 15:23:03 +00:00
5bfce8f345 Unit test for torch.compile bmm dtype (#167140)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/167140
Approved by: https://github.com/atalman, https://github.com/mlazos
2025-11-07 14:59:00 +00:00
edd611f3b0 [CI] Upgrade to Ubuntu 24.04 for XPU CI tests (#162475)
As the title states.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162475
Approved by: https://github.com/EikanWang, https://github.com/atalman
2025-11-07 14:05:16 +00:00
aded2ebb90 [3/N] Add return types of Python functions (#167287)
This PR adds return types to some Python functions.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167287
Approved by: https://github.com/mlazos
2025-11-07 13:50:33 +00:00
5bda7afa05 [9/N] Fix unused loop variables in tests (#167290)
This PR fixes unused loop variables in tests.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167290
Approved by: https://github.com/mlazos
2025-11-07 11:45:31 +00:00
341e924981 [4/N] Use key in dict for existence checks (#167285)
This PR uses `key in dict` expressions for existence checks of dict elements in Python code. This operation is more efficient than `key in dict.keys()`.
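
For illustration, the two forms below are equivalent; the first avoids constructing a keys view:
```python
counts = {"a": 1, "b": 2}

# Preferred: the membership test goes straight to the dict's hash table.
if "a" in counts:
    pass

# Works, but builds a dict_keys view object first before doing the same lookup.
if "a" in counts.keys():
    pass
```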

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167285
Approved by: https://github.com/mlazos
2025-11-07 09:47:17 +00:00
5a9ae7cefe [CP] Correctly compile create_cp_block_mask (#167153)
Currently we re-compile create_block_mask every time, which is not very efficient, and the global compilation also causes some issues. This PR lazily compiles create_block_mask and does it only once.
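
A minimal sketch of the lazy, compile-once pattern described above (names and the wrapper are illustrative; the actual CP helper lives elsewhere):
```python
from functools import lru_cache

import torch
from torch.nn.attention.flex_attention import create_block_mask

@lru_cache(maxsize=1)
def _compiled_create_block_mask():
    # Compile on first use only, instead of re-compiling on every call.
    return torch.compile(create_block_mask)

def create_cp_block_mask(mask_mod, B, H, Q_LEN, KV_LEN):
    # Hypothetical wrapper mirroring the commit's intent.
    return _compiled_create_block_mask()(mask_mod, B, H, Q_LEN, KV_LEN)
```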

Fixes https://github.com/pytorch/pytorch/issues/167064

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167153
Approved by: https://github.com/drisspg, https://github.com/XilunWu
2025-11-07 09:31:45 +00:00
3d59e8aadf [14/N] Apply ruff UP035 rule (#167208)
This PR continues to apply the `UP035` ruff rule and adds `collections.abc` to dynamo checks.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167208
Approved by: https://github.com/mlazos
2025-11-07 09:21:51 +00:00
4cf1d1af22 [Inductor][Tritonparse] Ensure inductor meta has config_args (#167261)
Summary: Before calling the tritonparse hook with `config_args`, ensure that we set `config_args` within `inductor_meta`. This way, even if it is not set, the hook still gets run and we can at least get the launch arguments.

Test Plan: Tritonparse tests

Differential Revision: D86463732

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167261
Approved by: https://github.com/FindHao
2025-11-07 08:55:47 +00:00
05b8214e6a Added a couple of utils for Pallas TPU backend. (#167264)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/167264
Approved by: https://github.com/oulgen
2025-11-07 08:23:02 +00:00
35d2da32bd [ROCm][CI] Separate out rocm from slow workflow (#167262)
Running slow.yml on every commit is straining our limited MI200 capacity. Reducing the frequency in line with other MI200-based workflows as per https://github.com/pytorch/pytorch/pull/167220

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167262
Approved by: https://github.com/jeffdaily

Co-authored-by: Jithun Nair <37884920+jithunnair-amd@users.noreply.github.com>
2025-11-07 07:38:59 +00:00
0968e74266 [ROCm][CI] Run PR-Based workflow runs on mi300 nodes. (#167225)
This PR is meant to swap the PR-based ciflow tags from the mi200 nodes (less stable) to the mi300 nodes (more stable). This will ensure that developers see consistent testing on their PRs as well as on main. This PR does all of the following:

- Rename rocm.yml to rocm-mi200.yml : for clarity
- Add ciflow/rocm-mi200 trigger to rocm-mi200.yml : for devs who want to opt-in to single-GPU unit tests on MI200
- Move ciflow/rocm trigger from rocm-mi200.yml to rocm-mi300.yml : so PRs target MI300 runners by default

- Rename inductor-rocm.yml to inductor-rocm-mi200.yml : for clarity
- Remove ciflow/inductor-rocm trigger from inductor-rocm-mi200.yml : prevent MI200 inductor config unit tests being triggered by default
- Add ciflow/inductor-rocm-mi200 trigger to inductor-rocm-mi200.yml : for devs who want to opt-in to inductor config unit tests on MI200
- Move ciflow/periodic trigger from periodic-rocm-mi200.yml to periodic-rocm-mi300.yml : so PRs target MI300 runners by default

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167225
Approved by: https://github.com/jeffdaily, https://github.com/huydhn

Co-authored-by: Jithun Nair <jithun.nair@amd.com>
2025-11-07 07:37:34 +00:00
57dd6a0656 [OC][Torch] Extend autotune options for OC OBA 200x shapes (#166931)
Summary:
Add four best configs for shapes of the OC OBA 200x model:
```
M=2048 N=2048 K=12288
triton_mm_35 0.1526 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=128, BLOCK_M=64, BLOCK_N=128, EVEN_K=True, GROUP_M=8, USE_FAST_ACCUM=True, kpack=2, matrix_instr_nonkdim=16, waves_per_eu=0, num_stages=2, num_warps=4, num_consumer_groups=0, num_buffers_warp_spec=0

M=2048 N=52416 K=1536
triton_mm_12 0.4604 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=128, BLOCK_M=128, BLOCK_N=128, EVEN_K=True, GROUP_M=8, USE_FAST_ACCUM=True, kpack=2, matrix_instr_nonkdim=16, waves_per_eu=0, num_stages=2, num_warps=4, num_consumer_groups=0, num_buffers_warp_spec=0

M=2048 N=12288 K=2048
triton_mm_9 0.1444 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=128, BLOCK_M=256, BLOCK_N=128, EVEN_K=True, GROUP_M=8, USE_FAST_ACCUM=True, kpack=2, matrix_instr_nonkdim=16, waves_per_eu=0, num_stages=2, num_warps=8, num_consumer_groups=0, num_buffers_warp_spec=0

M=2048 N=2048 K=52416
triton_mm_35 0.6505 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=128, BLOCK_M=64, BLOCK_N=128, EVEN_K=False, GROUP_M=8, USE_FAST_ACCUM=True, kpack=2, matrix_instr_nonkdim=16, waves_per_eu=0, num_stages=2, num_warps=4, num_consumer_groups=0, num_buffers_warp_spec=0
```

Test Plan:
Run tritonbench for torch fp8(_scaled_mm) for all above shapes, e.g.

```
TRITON_PRINT_AUTOTUNING=1 buck2 run mode/opt-amd-gpu -c fbcode.enable_gpu_sections=true //pytorch/tritonbench:run -- --op fp8_gemm --only pt2_fp8_gemm --metrics tflops,accuracy --m 2048 --n 2048 --k 12288
```

Differential Revision: D86158497

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166931
Approved by: https://github.com/jananisriram
2025-11-07 07:08:48 +00:00
7318ed627b [user-streams] Trace events with the new ops (#167177)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/167177
Approved by: https://github.com/anijain2305
ghstack dependencies: #167175, #167176, #167180, #167195, #167260
2025-11-07 06:25:35 +00:00
5b2ad2d5dc [user-streams] Add fallbacks for record and wait event (#167260)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/167260
Approved by: https://github.com/shunting314
ghstack dependencies: #167175, #167176, #167180, #167195
2025-11-07 06:25:35 +00:00
faba6e205f [pallas backend] use dlpack directly (#167243)
The previous version does not work on JAX 0.8.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167243
Approved by: https://github.com/yf225, https://github.com/jansel
2025-11-07 05:54:51 +00:00
3261149aa3 [dynamo] remove old unimplemented() call (#167149)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/167149
Approved by: https://github.com/Lucaskabela, https://github.com/mlazos
ghstack dependencies: #167001, #167146, #167159
2025-11-07 05:30:40 +00:00
bd7e18bc57 [dynamo] unimplemented -> unimplemented_v2 in torch/_subclasses/meta_utils.py (#167159)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/167159
Approved by: https://github.com/Lucaskabela, https://github.com/mlazos
ghstack dependencies: #167001, #167146
2025-11-07 05:30:40 +00:00
643b3bc8f3 [dynamo] unimplemented -> unimplemented_v2 in variables/higher_order_ops.py (#167146)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/167146
Approved by: https://github.com/Lucaskabela, https://github.com/mlazos
ghstack dependencies: #167001
2025-11-07 05:30:40 +00:00
91b626e2ef [dynamo] unimplemented -> unimplemented_v2 for the rest of variables/misc.py (#167001)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/167001
Approved by: https://github.com/Lucaskabela, https://github.com/mlazos
2025-11-07 05:30:40 +00:00
bf8297afe0 [inductor] let mix-order-red tune XBLOCK and num-stages (#167161)
A few improvements for autotuning:
- while testing mix-order reduction for internal workloads, Paul found that tuning num-stages could be very helpful for Triton kernels. The idea is illustrated in his diff: https://www.internalfb.com/diff/D86341591
- when rnumel is small, a larger XBLOCK could be helpful for perf

This PR adds the ability to autotune num-stages and XBLOCK. This brings a further 19% speedup for RMSNorm BWD on B200.

Testing result:

  eager 11 data points
  compiled 11 data points, 17.07x speedup (was 14.39x before the PR. The PR brings further 19% speedup)
  quack 11 data points, 12.72x speedup
  liger 11 data points, 11.75x speedup
  compiled-no-fusion 11 data points, 9.93x speedup

<img width="3564" height="2368" alt="RMSNormBackward_bench" src="https://github.com/user-attachments/assets/3e415242-a988-42bf-8a47-4ed5f11148a3" />

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167161
Approved by: https://github.com/jansel
ghstack dependencies: #166669, #166938
2025-11-07 04:49:53 +00:00
3f03f84ce2 [inductor] fix dashboard regression due to mix order reduction (#166938)
The PR includes a miscellaneous list of fixes for the regressions I see from the dashboard:
1. the dashboard may use very small shapes for rmsnorm backward. The data set can be fully cached in L2, so mix-order reduction does not show much benefit and may even have worse perf. Disable mix-order reduction for small workloads.
2. disable the autotuning of split size by default to avoid the compilation-time hit
3. avoid mix-order reduction if there is non-contiguous memory access. Previously the check was only done for shared buffers accessed by both reductions. It turns out to be necessary to expand the check to buffers accessed by only one reduction. Check the test test_avoid_non_coalesced_access, which is simplified from a TIMM model. Note that a larger XBLOCK could fix the perf problem and make mix-order reduction still applicable, but I don't think that's high priority: with a larger XBLOCK, the kernel would consume much more shared memory/registers, which could also cause perf issues.

Dashboard result [here](https://hud.pytorch.org/benchmark/compilers?dashboard=torchinductor&startTime=Wed%2C%2029%20Oct%202025%2003%3A40%3A22%20GMT&stopTime=Wed%2C%2005%20Nov%202025%2004%3A40%3A22%20GMT&granularity=hour&mode=training&dtype=amp&deviceName=cuda%20(h100)&lBranch=gh/shunting314/257/head&lCommit=b6f4a24ea5f7574d6b1d3b854022aa09d70593db&rBranch=main&rCommit=22a745737a09b0600bb0b85b4c0bbb9fb627f137).

<img width="1484" height="531" alt="Screenshot 2025-11-04 at 10 58 48 PM" src="https://github.com/user-attachments/assets/60cda211-3cc4-4fe1-9eaf-d2fb2c7d15a1" />

- the perf drop for TIMM (default) is not real; it's due to one more model passing the accuracy test
- the perf drop for HF (cudagraphs) is not real. I checked each individual model that showed as regressed on the dashboard, and they fall into the following categories:
   - shows as regressed, but absolute execution time is reduced, e.g. OPTForCausalLM
   - shows as regressed, but has a slight speedup on an H100 dev server: MobileBertForMaskedLM, from 57.847709 ms to 56.711640 ms
   - shows as regressed, but the PR does not change the generated kernels (mix-order reduction is skipped due to the small workload or other reasons), e.g. XGLMForCausalLM, AlbertForMaskedLM

Note that the neutral result on the dashboard is expected due to the small workload sizes. For large workloads, we see about a 1.5x geomean speedup for rmsnorm/layernorm backward and 2.2x for some shapes used by an internal model. For 8-GPU torchtitan training on llama3, we see a 4% TPS (tokens per second) improvement.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166938
Approved by: https://github.com/jansel
ghstack dependencies: #166669
2025-11-07 04:49:53 +00:00
8a72188828 Raise error for 1D (size > 1) -> 0D parameter loads (#166335)
Fixes #165873

# Title
Fix load_state_dict: raise error for 1D (size > 1) -> 0D parameter loads

## Summary
This PR fixes a bug where loading a 1D tensor (size > 1) into a scalar (0D) parameter would silently take the first element instead of raising an error. The fix preserves backward compatibility for 1D tensors of size 1 while catching genuine shape mismatches.

## Motivation
Previously, loading a 1D tensor like torch.randn(32000) into a 0D scalar parameter would silently slice the first element, leading to silent data loss and potential bugs. This change ensures users get a clear error when there's a genuine shape mismatch.

## Behavior change

Before:
1D tensor (any length) -> 0D scalar -> silently coerced using input_param[0]

After:
- 1D tensor (size == 1) -> 0D scalar -> allowed (backward compatibility)
- 1D tensor (size > 1) -> 0D scalar -> raises RuntimeError with size mismatch message

In torch/nn/modules/module.py, _load_from_state_dict, added input_param.shape[0] == 1 check to the backward compatibility condition to only allow single-element 1D tensors.
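
A small sketch of the behavior change described above (assuming this fix is applied):
```python
import torch
import torch.nn as nn

class Scaler(nn.Module):
    def __init__(self):
        super().__init__()
        self.scale = nn.Parameter(torch.tensor(1.0))  # 0-D scalar parameter

m = Scaler()

# Size-1 1-D tensor -> 0-D parameter: still allowed for backward compatibility.
m.load_state_dict({"scale": torch.tensor([2.0])})

# 1-D tensor with size > 1 -> 0-D parameter: now raises instead of silently
# taking the first element.
try:
    m.load_state_dict({"scale": torch.randn(32000)})
except RuntimeError as e:
    print(e)
```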

## Tests
Added test_scalar_param_1d_tensor_raises to verify that loading 1D tensors of size > 1 raises an error, while size 1 loads successfully.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166335
Approved by: https://github.com/mikaylagawarecki
2025-11-07 04:43:11 +00:00
d325aa1877 [vision hash update] update the pinned vision hash (#167032)
This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml).
Update the pinned vision hash.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/167032
Approved by: https://github.com/pytorchbot
2025-11-07 04:22:57 +00:00
7aedf3a576 Update torch-xpu-ops commit pin (#166945)
Update the torch-xpu-ops commit to [intel/torch-xpu-ops@9aac5a](9aac5a1ddf), which includes:

- Enable FP8 concat/where/flip/index_put/index.Tensor on XPU backend
- Remove BUILD_SPLIT_KERNEL_LIB flag
- Fix the initialization order of ProcessGroupXCCL
- Separates communication initialization logic from getXCCLComm
- Fix segmentation fault in NLLLoss kernel

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166945
Approved by: https://github.com/EikanWang
2025-11-07 03:49:42 +00:00
eaf4815c1f Remove workarounds for older Python (#167173)
This PR removes workarounds for older Python.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167173
Approved by: https://github.com/albanD
2025-11-07 03:37:56 +00:00
a913b2bb93 [2/N] Add return types of Python functions (#167203)
This PR adds return types to some Python functions. Most of them return `None`. The types were added automatically by ruff ANN rules.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167203
Approved by: https://github.com/Skylion007

Co-authored-by: Aaron Gokaslan <aaronGokaslan@gmail.com>
2025-11-07 03:22:57 +00:00
1632876edf [3/N] Use key in dict for existence checks (#167214)
This PR uses `key in dict` expressions for existence checks of dict elements in Python code. This operation is more efficient than `key in dict.keys()`.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167214
Approved by: https://github.com/Lucaskabela
2025-11-07 02:49:15 +00:00
0e1f76f77e Add two new docker images with Python 3.11/3.12 (#167092)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/167092
Approved by: https://github.com/malfet, https://github.com/atalman
2025-11-07 02:44:03 +00:00
ae67a5a9d3 [ROCm] Specialized binary elementwise broadcast kernel for mixed dtypes with float/bfloat16/half (#167233)
* `c10::fetch_and_cast` and `c10::cast_and_store` produce branchy code since they support all datatypes
* So, we do special handling for binary elementwise broadcast with mixed dtypes of float/bfloat16/half
* This improves performance

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167233
Approved by: https://github.com/jeffdaily
2025-11-07 02:42:09 +00:00
292bd62c71 Introduce TEST_ACCELERATOR and TEST_MULTIACCELERATOR to simplify UT (#167196)
# Motivation
This PR aims to introduce two variables (`TEST_ACCELERATOR` and `TEST_MULTIACCELERATOR`) to simplify UT generalization. Since out-of-tree backends may be imported later, these variables are defined as lazy values.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167196
Approved by: https://github.com/albanD
2025-11-07 01:51:18 +00:00
0e512ee9f0 Make pyrefly installable by lintrunner on Python-3.14 (#167270)
By pinning numpy to 2.3.4 for 3.14

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167270
Approved by: https://github.com/huydhn
2025-11-07 01:43:25 +00:00
31ac764239 Revert "Move enrich_profiler_metadata config import out of gm.recompile() (#167114)"
This reverts commit d144382dc96f109a6254c38734779e0a09fb7134.

Reverted https://github.com/pytorch/pytorch/pull/167114 on behalf of https://github.com/jeffdaily due to broke rocm ([comment](https://github.com/pytorch/pytorch/pull/167114#issuecomment-3500057321))
2025-11-07 01:21:15 +00:00
b228f6d180 Revert "[ROCm] Enable StaticCudaLauncher for ROCm (#166492)"
This reverts commit ba2e6b0b4f1718767762d7b20558d4de943be71b.

Reverted https://github.com/pytorch/pytorch/pull/166492 on behalf of https://github.com/jeffdaily due to test/inductor/test_ck_backend.py::TestCKBackend::test_max_autotune_precompile_matmul_dynamic_max_autotune_gemm_backends_CK_autotune_in_subproc_True [GH job link](https://github.com/pytorch/pytorch/actions/runs/19147453561/job/54731084387) [HUD commit link](ba2e6b0b4f) ([comment](https://github.com/pytorch/pytorch/pull/166492#issuecomment-3500049276))
2025-11-07 01:17:13 +00:00
e678450a69 [cuDNN][SDPA][Convolution] Expose cuDNN runtime version in CUDA hooks (#167111)
cuDNN dispatching heuristics rely on version checks, but currently only the compile-time version is exposed. If we want to allow users to resolve https://github.com/pytorch/pytorch/issues/166643 on their end by updating their cuDNN version locally, we need to check the runtime version rather than the compile-time version.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167111
Approved by: https://github.com/Skylion007
2025-11-07 01:15:18 +00:00
552c3f3e18 Add THO_DISPATCH_V2 macro (#166629)
The THO_DISPATCH_V2 macro is the same as AT_DISPATCH_V2 but is usable in header-only contexts or stable ABI code. The main difference is that AT_DISPATCH_V2 supports selective build while THO_DISPATCH_V2 does not.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166629
Approved by: https://github.com/janeyx99, https://github.com/albanD
ghstack dependencies: #165856
2025-11-07 01:13:55 +00:00
5b36e4e30f Move AT_DISPATCH_V2 helper macros to headeronly and add THO_DISPATCH_V2_TMPL (#165856)
Problem: the migration of `AT_DISPATCH_V2` macros to headeronly cannot be a simple copy-paste of macro definitions from one header file to another because the macros `AT_DISPATCH_SWITCH` and `AT_DISPATCH_CASE` may use functions that cannot be migrated to headeronly, e.g. when a selective build feature is enabled, there will be functions that are generated. On the other hand, when not using selective build, the dtype-dispatch macros are perfectly suitable for migrating to headeronly.

In this PR, the migration problem above is tackled by refactoring `AT_DISPATCH` related macros into headeronly macros and non-headeronly macros while preserving the current API and semantics. For instance, consider the current V2 macro definitions:
```c++
#define AT_DISPATCH_V2(TYPE, NAME, BODY, ...) \
  AT_DISPATCH_SWITCH(TYPE, NAME, AT_AP_VAR(AT_WRAP(BODY), TYPE, __VA_ARGS__))
#define AT_AP_VAR(N, T, ...) \
  AT_EXPAND(AT_CONCAT(AT_AP, AT_NUM_ARGS(__VA_ARGS__))(AT_WRAP(N), __VA_ARGS__))
#define AT_AP1(N, _1) AT_DISPATCH_CASE(_1, N)
...
```
where the headeronly-migration-problematic parts are the uses of the AT_DISPATCH_SWITCH and AT_DISPATCH_CASE macros (defined in ATen/Dispatch.h). In this PR, we introduce parametric versions of the `AT_DISPATCH_V2` and `AT_AP1` macros that have a `_TMPL` suffix, take DISPATCH_SWITCH and DISPATCH_CASE arguments, and are defined in `torch/headeronly/core/Dispatch_v2.h`:
```c++
#define THO_DISPATCH_V2_TMPL(                               \
    DISPATCH_SWITCH, DISPATCH_CASE, TYPE, NAME, BODY, ...) \
  DISPATCH_SWITCH(                                         \
      TYPE,                                                \
      NAME,                                                \
      THO_AP_VAR_TMPL(DISPATCH_CASE, AT_WRAP(BODY), TYPE, __VA_ARGS__))
#define THO_AP_VAR_TMPL(C, N, T, ...) \
  AT_EXPAND(                         \
      AT_CONCAT(THO_AP, AT_NUM_ARGS(__VA_ARGS__))(C, AT_WRAP(N), __VA_ARGS__))
#define THO_AP1(C, N, _1) C(_1, N)
...
```
so that the original V2 macro definition, defined in ATen/Dispatch_v2.h, becomes:
```c++
#define AT_DISPATCH_V2(TYPE, NAME, BODY, ...) \
  THO_DISPATCH_V2_TMPL(                        \
      AT_DISPATCH_SWITCH,                     \
      AT_DISPATCH_CASE,                       \
      TYPE,                                   \
      NAME,                                   \
      AT_WRAP(BODY),                          \
      __VA_ARGS__)
```
that has exactly the same API and semantics as the original definition.

Note 1: ~we have changed the definition of `AT_AP1(N, _1) ...` to `AT_AP1(C, N, _1) ...` without renaming `AT_AP1` because `AT_AP1` is a helper macro that is not a part of public API (for instance, nothing in pytorch explicitly uses `AT_AP1`).~ UPDATE: restored the original `AT_AP` macros and introduced new `THO_AP` macros.

Note 2: this PR introduces a new API macro, THO_DISPATCH_V2_TMPL, that will be available to stable ABI users, who can use it by providing custom versions of the `AT_DISPATCH_SWITCH` and `AT_DISPATCH_CASE` macros, say, with selective build features removed.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165856
Approved by: https://github.com/janeyx99
2025-11-07 01:13:55 +00:00
cd6d06a22b Revert "[BE][Typing][Dynamo] Type torch/_dynamo/variables/functions.py (#167103)"
This reverts commit 9a86ef763201e27f031469f0866c893707e9cf38.

Reverted https://github.com/pytorch/pytorch/pull/167103 on behalf of https://github.com/pytorch-auto-revert due to Reverted automatically by pytorch's autorevert, to avoid this behaviour add the tag autorevert: disable ([comment](https://github.com/pytorch/pytorch/pull/167103#issuecomment-3500023910))
2025-11-07 01:06:34 +00:00
669cf21a6b Added Validation for batch_norm eps value (#166756)
Fixes #166405.
I've fixed this by adding epsilon validation in `torch.nn.functional.batch_norm` to reject non-positive values before they cause undefined behavior. Also added a test case `test_batchnorm_invalid_eps` to verify the fix works correctly.
While working on this, I noticed that ```layer_norm```, ```group_norm```, and ```instance_norm``` also don't validate epsilon and could have the same issue. Should I add validation for those in this PR as well?
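
A quick sketch of the validation being added (the exact exception type and message are assumptions in this sketch):
```python
import torch
import torch.nn.functional as F

x = torch.randn(8, 3)
running_mean = torch.zeros(3)
running_var = torch.ones(3)

# A non-positive eps is now rejected up front instead of silently producing
# undefined results.
try:
    F.batch_norm(x, running_mean, running_var, eps=-1e-5)
except (ValueError, RuntimeError) as e:
    print(e)
```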

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166756
Approved by: https://github.com/mikaylagawarecki
2025-11-07 00:53:58 +00:00
9a86ef7632 [BE][Typing][Dynamo] Type torch/_dynamo/variables/functions.py (#167103)
Provides type coverage to torch/_dynamo/variables/functions.py

Coverage report:
`mypy torch/_dynamo/variables/functions.py --linecount-report /tmp/coverage_log`

Compare before to after - we go from 0 lines and 0 funcs covered to 2698 lines and 166 funcs covered

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167103
Approved by: https://github.com/mlazos, https://github.com/fxdawnn
2025-11-07 00:40:49 +00:00
f47cadf75d [BE][Typing][Dynamo] Type torch/_dynamo/variables/lists.py (#167156)
Provides type coverage to torch/_dynamo/variables/lists.py

Coverage report:
`mypy torch/_dynamo/variables/lists.py --linecount-report /tmp/coverage_log`

Compare before to after - we go from 0 lines and 0 funcs covered to 1759 lines and 102 funcs covered

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167156
Approved by: https://github.com/Skylion007, https://github.com/rtimpe
2025-11-07 00:15:40 +00:00
2923b02c6e [DTensor] add explicit mode (ExplicitRedistributionContext) (#166593)
usage:

```
dx = distribute_tensor(x, device_mesh, [Shard(0)])
dA = distribute_tensor(A, device_mesh, [Shard(0)])
with ExplicitRedistributionContext():
    with self.assertRaisesRegex(RuntimeError, "Implicit redistribution"):
        # Shard(0) @ Shard(0) requires a redistribution
        torch.matmul(dx, dA)
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166593
Approved by: https://github.com/ezyang
2025-11-07 00:04:19 +00:00
4b9ba0fb26 [user-streams] Add requires cuda to all test cases (#167195)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/167195
Approved by: https://github.com/Lucaskabela
ghstack dependencies: #167175, #167176, #167180
2025-11-06 23:13:47 +00:00
106d34c80a [user-streams] add requires cuda decorator (#167180)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/167180
Approved by: https://github.com/donigian, https://github.com/Lucaskabela, https://github.com/Skylion007
ghstack dependencies: #167175, #167176
2025-11-06 23:13:47 +00:00
0b06109412 [user-streams] Fix bug in object bytecode construction (#167176)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/167176
Approved by: https://github.com/Lucaskabela
ghstack dependencies: #167175
2025-11-06 23:13:47 +00:00
2073af5790 [user-streams] Refactor user object index in streams (#167175)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/167175
Approved by: https://github.com/Lucaskabela
2025-11-06 23:13:47 +00:00
9b4ac45d2f Revert "[Inductor] addmm with bias -> unfuse bias if there is a pointwise/reduction consumer (#166165)"
This reverts commit eefa16342c9f322b56c7c0cd6d309c3ed8f0b882.

Reverted https://github.com/pytorch/pytorch/pull/166165 on behalf of https://github.com/jeanschmidt due to Breaking internal tests D86216934 ([comment](https://github.com/pytorch/pytorch/pull/166165#issuecomment-3499645688))
2025-11-06 22:34:48 +00:00
a45a17f65e Fix boxcox to return same result for same input in one batch (#166986)
Summary:
The SIMD path uses the SLEEF version of pow, which is slightly different from std::pow. The fix is to use the same vectorized code (with partial load and store) for the trailing data as well, to ensure consistency between results.

Deploy:
Need to make a hotfix in waas to monitor release signals, since this diff can cause testing failures in veloski and waas release correctness tests.

Test Plan: Sandcastle.

Differential Revision: D86218207

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166986
Approved by: https://github.com/swolchok
2025-11-06 22:33:26 +00:00
c5593e75b3 Fix flaky memory profiler test (#167168)
Fixes #167037

Do not check the exact number of frames.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/167168
Approved by: https://github.com/angelayi
2025-11-06 21:39:44 +00:00
c90a976370 Update pythoncapi_compat.h (#167138)
Update to commit 44c8e14bbbb5d5135ae90957036a61397e4df577.

Should slightly simplify https://github.com/pytorch/pytorch/pull/166342
Pull Request resolved: https://github.com/pytorch/pytorch/pull/167138
Approved by: https://github.com/albanD
2025-11-06 21:31:58 +00:00
d144382dc9 Move enrich_profiler_metadata config import out of gm.recompile() (#167114)
Fixes T243967987

Move `enrich_profiler_metadata` from `torch._dynamo.config` to `torch.fx.experimental._config`.

We cannot import anything inside recompile(); doing so caused a perf regression internally. We move the config so we can import it at the top of `graph_module.py` without causing any circular import.

We also cannot delete the old config right now because some internal unit tests rely on copies of the old `graph_module.py`. But we should be able to delete the old config soon after this PR lands.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/167114
Approved by: https://github.com/angelayi
2025-11-06 21:21:40 +00:00
78827c5e00 Distributed Autotuning (#163369)
This is the initial prototype of distributed autotuning. It's intended to be a basis for iteration rather than the final end product.

Currently, when we run an SPMD program, we compile the ranks independently. As a result the autotuning is repeated on every rank, so for an 8-GPU program with 8 matmul operators we'll autotune 64 (8*8) times.

Distributed autotuning uses collectives to distribute the autotuning across the ranks so that each rank autotunes 1/world_size of the total operators. In our 8-GPU example we would only perform 8 autotunes total (one on each rank) rather than 64.

There are several advantages:
1. Faster autotuning times - each CPU/GPU does less work total
2. Better determinism - currently it's possible for two ranks to choose different algorithms for the same operator. With distributed autotuning we choose the algorithm once for the entire program.

Results:

In testing with llama3 8B on torchtitan, max-autotune time was reduced from 52s to 26s and exhaustive autotuning from 2009s to 613s.

Usage:

The feature is controlled by the environment variable TORCHINDUCTOR_DISTRIBUTED_AUTOTUNE.
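
A minimal sketch of opting in from a rank's script, assuming only the environment variable named above is required:
```python
import os

import torch

# Enable the prototype before any compilation happens on this rank.
os.environ["TORCHINDUCTOR_DISTRIBUTED_AUTOTUNE"] = "1"

@torch.compile(mode="max-autotune")
def matmul(a, b):
    return a @ b

# Each rank then autotunes only its share of the operators, and the chosen
# configs are exchanged via collectives (per the description above).
```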

Co-authored-by: @PaulZhang12

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163369
Approved by: https://github.com/PaulZhang12
2025-11-06 21:10:21 +00:00
ab1e734cd7 [ez] avoid log spam when random data is generated (#166919)
It's annoying to see a full screen of this warning when running fx_graph_runnable files saved in tlparse.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/166919
Approved by: https://github.com/eellison
2025-11-06 21:05:20 +00:00
888958ad6c Prevent torch._check causing graph breaks (#164676)
Handle `torch._check` in `TorchInGraphFunctionVariable.call_function`. Basically, it has two arguments: a predicate (bool) and a message (callable). If the predicate is a constant, evaluate `torch._check` eagerly: if the predicate is true, compilation just proceeds and nothing happens; if it is false, `torch._check` raises an exception.

If the predicate is not constant, we manually emit a proxy. I tried to build as_proxy() inside NestedUserFunctionVariable but failed, so I create it here. I try to extract the message: if it's a function, I retrieve it; if not, I set it to None. Maybe we could extract it if the message is a closure, but I'm not sure how.
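
A small example of the intended effect, assuming the handling described above:
```python
import torch

@torch.compile(fullgraph=True)
def f(x):
    # Constant-foldable predicate: evaluated while tracing, so no graph break.
    torch._check(x.dim() == 2, lambda: "expected a 2-D tensor")
    return x.sin()

f(torch.randn(4, 4))
```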

Fixes #163668

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164676
Approved by: https://github.com/williamwen42, https://github.com/mlazos

Co-authored-by: William Wen <william.wen42@gmail.com>
2025-11-06 21:00:48 +00:00
574 changed files with 14559 additions and 6839 deletions

View File

@ -36,11 +36,7 @@ case ${DOCKER_TAG_PREFIX} in
;;
rocm*)
BASE_TARGET=rocm
PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
# add gfx950, gfx115x conditionally starting in ROCm 7.0
if [[ "$ROCM_VERSION" == *"7.0"* ]]; then
PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950;gfx1150;gfx1151"
fi
PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201;gfx950;gfx1150;gfx1151"
EXTRA_BUILD_ARGS="${EXTRA_BUILD_ARGS} --build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}"
;;
*)

View File

@ -168,6 +168,18 @@ case "$tag" in
VISION=yes
TRITON=yes
;;
pytorch-linux-jammy-py3.11-clang12)
ANACONDA_PYTHON_VERSION=3.11
CLANG_VERSION=12
VISION=no
TRITON=no
;;
pytorch-linux-jammy-py3.12-clang12)
ANACONDA_PYTHON_VERSION=3.12
CLANG_VERSION=12
VISION=no
TRITON=no
;;
pytorch-linux-jammy-rocm-n-py3 | pytorch-linux-jammy-rocm-n-py3-benchmarks | pytorch-linux-noble-rocm-n-py3)
if [[ $tag =~ "jammy" ]]; then
ANACONDA_PYTHON_VERSION=3.10
@ -195,9 +207,9 @@ case "$tag" in
NINJA_VERSION=1.9.0
TRITON=yes
;;
pytorch-linux-jammy-xpu-n-py3 | pytorch-linux-jammy-xpu-n-py3-inductor-benchmarks)
pytorch-linux-noble-xpu-n-py3 | pytorch-linux-noble-xpu-n-py3-inductor-benchmarks)
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=11
GCC_VERSION=13
VISION=yes
XPU_VERSION=2025.2
NINJA_VERSION=1.9.0
@ -248,6 +260,12 @@ case "$tag" in
HALIDE=yes
TRITON=yes
;;
pytorch-linux-jammy-cuda12.8-py3.12-pallas)
CUDA_VERSION=12.8.1
ANACONDA_PYTHON_VERSION=3.12
GCC_VERSION=11
PALLAS=yes
;;
pytorch-linux-jammy-py3.12-triton-cpu)
CUDA_VERSION=12.6
ANACONDA_PYTHON_VERSION=3.12
@ -369,6 +387,7 @@ docker build \
--build-arg "INDUCTOR_BENCHMARKS=${INDUCTOR_BENCHMARKS}" \
--build-arg "EXECUTORCH=${EXECUTORCH}" \
--build-arg "HALIDE=${HALIDE}" \
--build-arg "PALLAS=${PALLAS}" \
--build-arg "XPU_VERSION=${XPU_VERSION}" \
--build-arg "UNINSTALL_DILL=${UNINSTALL_DILL}" \
--build-arg "ACL=${ACL:-}" \

View File

@ -0,0 +1 @@
0.8.0

View File

@ -0,0 +1,40 @@
#!/bin/bash
set -ex
source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh"
# Get the pinned JAX version (same for all CUDA versions)
JAX_VERSION=$(get_pinned_commit /ci_commit_pins/jax)
function install_jax_12() {
echo "Installing JAX ${JAX_VERSION} with CUDA 12 support"
pip_install "jax[cuda12]==${JAX_VERSION}" -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html
# Verify installation
python -c "import jax" # check for errors
echo "JAX ${JAX_VERSION} installation completed successfully for CUDA 12"
}
function install_jax_13() {
echo "Installing JAX ${JAX_VERSION} with CUDA 13 support"
pip_install "jax[cuda13]==${JAX_VERSION}" -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html
# Verify installation
python -c "import jax" # check for errors
echo "JAX ${JAX_VERSION} installation completed successfully for CUDA 13"
}
# idiomatic parameter and option handling in sh
while test $# -gt 0
do
case "$1" in
12.4|12.6|12.6.*|12.8|12.8.*|12.9|12.9.*) install_jax_12;
;;
13.0|13.0.*) install_jax_13;
;;
*) echo "bad argument $1"; exit 1
;;
esac
shift
done

View File

@ -9,7 +9,7 @@ set -xe
function install_ubuntu() {
. /etc/os-release
if [[ ! " jammy " =~ " ${VERSION_CODENAME} " ]]; then
if [[ ! " jammy noble " =~ " ${VERSION_CODENAME} " ]]; then
echo "Ubuntu version ${VERSION_CODENAME} not supported"
exit
fi
@ -35,25 +35,24 @@ function install_ubuntu() {
# The xpu-smi packages
apt-get install -y flex bison xpu-smi
if [[ "${XPU_DRIVER_TYPE,,}" == "lts" ]]; then
# Compute and Media Runtimes
# Compute and Media Runtimes
if [[ " ${VERSION_CODENAME} " =~ " noble " ]]; then
apt-get install -y \
intel-opencl-icd intel-level-zero-gpu level-zero \
intel-media-va-driver-non-free libmfx1 libmfxgen1 libvpl2 \
libegl-mesa0 libegl1-mesa libegl1-mesa-dev libgbm1 libgl1-mesa-dev libgl1-mesa-dri \
intel-opencl-icd libze-intel-gpu1 libze1 \
intel-media-va-driver-non-free libmfx-gen1 libvpl2 \
libegl-mesa0 libegl1-mesa-dev libgbm1 libgl1-mesa-dev libgl1-mesa-dri \
libglapi-mesa libgles2-mesa-dev libglx-mesa0 libigdgmm12 libxatracker2 mesa-va-drivers \
mesa-vdpau-drivers mesa-vulkan-drivers va-driver-all vainfo hwinfo clinfo
# Development Packages
apt-get install -y libigc-dev intel-igc-cm libigdfcl-dev libigfxcmrt-dev level-zero-dev
else # rolling driver
mesa-vdpau-drivers mesa-vulkan-drivers va-driver-all vainfo hwinfo clinfo intel-ocloc
else # jammy
apt-get install -y \
intel-opencl-icd libze-intel-gpu1 libze1 \
intel-media-va-driver-non-free libmfx-gen1 libvpl2 \
libegl-mesa0 libegl1-mesa libegl1-mesa-dev libgbm1 libgl1-mesa-dev libgl1-mesa-dri \
libglapi-mesa libglx-mesa0 libigdgmm12 libxatracker2 mesa-va-drivers \
mesa-vdpau-drivers mesa-vulkan-drivers va-driver-all vainfo hwinfo clinfo intel-ocloc
apt-get install -y libigc-dev intel-igc-cm libigdfcl-dev libigfxcmrt-dev libze-dev
fi
# Development Packages
apt-get install -y libigc-dev intel-igc-cm libigdfcl-dev libigfxcmrt-dev libze-dev
# Install Intel Support Packages
apt-get install -y ${XPU_PACKAGES}
@ -66,7 +65,7 @@ function install_ubuntu() {
function install_rhel() {
. /etc/os-release
if [[ "${ID}" == "rhel" ]]; then
if [[ ! " 8.8 8.9 9.0 9.2 9.3 " =~ " ${VERSION_ID} " ]]; then
if [[ ! " 8.8 8.10 9.0 9.2 9.3 " =~ " ${VERSION_ID} " ]]; then
echo "RHEL version ${VERSION_ID} not supported"
exit
fi
@ -147,7 +146,7 @@ function install_sles() {
XPU_DRIVER_VERSION=""
if [[ "${XPU_DRIVER_TYPE,,}" == "lts" ]]; then
# Use GPU driver LTS releases
XPU_DRIVER_VERSION="/lts/2350"
XPU_DRIVER_VERSION="/lts/2523"
fi
# Default use Intel® oneAPI Deep Learning Essentials 2025.1

View File

@ -49,11 +49,7 @@ case ${DOCKER_TAG_PREFIX} in
fi
BASE_TARGET=rocm
GPU_IMAGE=rocm/dev-ubuntu-22.04:${GPU_ARCH_VERSION}-complete
PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
# add gfx950, gfx115x conditionally starting in ROCm 7.0
if [[ "$GPU_ARCH_VERSION" == *"7.0"* ]]; then
PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950;gfx1150;gfx1151"
fi
PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201;gfx950;gfx1150;gfx1151"
DOCKER_GPU_BUILD_ARG="--build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg ROCM_VERSION=${GPU_ARCH_VERSION}"
;;
*)

View File

@ -87,11 +87,7 @@ case ${image} in
MANY_LINUX_VERSION="2_28"
DEVTOOLSET_VERSION="11"
GPU_IMAGE=rocm/dev-almalinux-8:${GPU_ARCH_VERSION}-complete
PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
# add gfx950, gfx115x conditionally starting in ROCm 7.0
if [[ "$GPU_ARCH_VERSION" == *"7.0"* ]]; then
PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950;gfx1150;gfx1151"
fi
PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201;gfx950;gfx1150;gfx1151"
DOCKER_GPU_BUILD_ARG="--build-arg ROCM_VERSION=${GPU_ARCH_VERSION} --build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg DEVTOOLSET_VERSION=${DEVTOOLSET_VERSION}"
;;
manylinux2_28-builder:xpu)

View File

@ -143,6 +143,15 @@ COPY ci_commit_pins/halide.txt halide.txt
RUN if [ -n "${HALIDE}" ]; then bash ./install_halide.sh; fi
RUN rm install_halide.sh common_utils.sh halide.txt
ARG PALLAS
ARG CUDA_VERSION
# Install JAX with CUDA support (for Pallas)
COPY ./common/install_jax.sh install_jax.sh
COPY ./common/common_utils.sh common_utils.sh
COPY ./ci_commit_pins/jax.txt /ci_commit_pins/jax.txt
RUN if [ -n "${PALLAS}" ]; then bash ./install_jax.sh ${CUDA_VERSION}; fi
RUN rm -f install_jax.sh common_utils.sh /ci_commit_pins/jax.txt
ARG ONNX
# Install ONNX dependencies
COPY ./common/install_onnx.sh ./common/common_utils.sh ./

View File

@ -8,9 +8,11 @@ from abc import ABC, abstractmethod
try:
from typing import Any, Callable, Required, TypedDict # Python 3.11+
from collections.abc import Callable # Python 3.11+
from typing import Any, Required, TypedDict
except ImportError:
from typing import Any, Callable, TypedDict
from collections.abc import Callable
from typing import Any, TypedDict
from typing_extensions import Required # Fallback for Python <3.11

View File

@ -168,14 +168,16 @@ if [[ "$BUILD_ENVIRONMENT" == *xpu* ]]; then
# shellcheck disable=SC1091
source /opt/intel/oneapi/compiler/latest/env/vars.sh
# shellcheck disable=SC1091
source /opt/intel/oneapi/umf/latest/env/vars.sh
# shellcheck disable=SC1091
source /opt/intel/oneapi/ccl/latest/env/vars.sh
# shellcheck disable=SC1091
source /opt/intel/oneapi/mpi/latest/env/vars.sh
# shellcheck disable=SC1091
source /opt/intel/oneapi/pti/latest/env/vars.sh
# Enable XCCL build
export USE_XCCL=1
export USE_MPI=0
# XPU kineto feature dependencies are not fully ready, disable kineto build as temp WA
export USE_KINETO=0
export TORCH_XPU_ARCH_LIST=pvc
fi

View File

@ -208,6 +208,8 @@ if [[ "$BUILD_ENVIRONMENT" == *xpu* ]]; then
source /opt/intel/oneapi/ccl/latest/env/vars.sh
# shellcheck disable=SC1091
source /opt/intel/oneapi/mpi/latest/env/vars.sh
# shellcheck disable=SC1091
source /opt/intel/oneapi/pti/latest/env/vars.sh
# Check XPU status before testing
timeout 30 xpu-smi discovery || true
fi
@ -824,6 +826,11 @@ test_inductor_halide() {
assert_git_not_dirty
}
test_inductor_pallas() {
python test/run_test.py --include inductor/test_pallas.py --verbose
assert_git_not_dirty
}
test_inductor_triton_cpu() {
python test/run_test.py --include inductor/test_triton_cpu_backend.py inductor/test_torchinductor_strided_blocks.py --verbose
assert_git_not_dirty
@ -1724,6 +1731,8 @@ elif [[ "${TEST_CONFIG}" == *inductor_distributed* ]]; then
test_inductor_distributed
elif [[ "${TEST_CONFIG}" == *inductor-halide* ]]; then
test_inductor_halide
elif [[ "${TEST_CONFIG}" == *inductor-pallas* ]]; then
test_inductor_pallas
elif [[ "${TEST_CONFIG}" == *inductor-triton-cpu* ]]; then
test_inductor_triton_cpu
elif [[ "${TEST_CONFIG}" == *inductor-micro-benchmark* ]]; then

View File

@ -1 +1 @@
cfbc5c2f1c798991715a6b06bb3ce46478c4487c
ccb801b88af136454798b945175c4c87e636ac33

View File

@ -1 +1 @@
c8b09f5f77d6bf6fb7ed7a9aa83e5d8156b3a5e9
e4d25697f9dc5eedaf8f0a5bf085c62c5455a53a

22
.github/labeler.yml vendored
View File

@ -138,7 +138,8 @@
- test/test_matmul_cuda.py
- test/test_scaled_matmul_cuda.py
- test/inductor/test_fp8.py
- aten/src/ATen/native/cuda/Blas.cpp
- aten/src/ATen/native/cuda/*Blas.cpp
- aten/src/ATen/cuda/CUDA*Blas.*
- torch/**/*cublas*
- torch/_inductor/kernel/mm.py
- test/inductor/test_max_autotune.py
@ -148,7 +149,8 @@
- test/test_matmul_cuda.py
- test/test_scaled_matmul_cuda.py
- test/inductor/test_fp8.py
- aten/src/ATen/native/cuda/Blas.cpp
- aten/src/ATen/native/cuda/*Blas.cpp
- aten/src/ATen/cuda/CUDA*Blas.*
- torch/**/*cublas*
- torch/_inductor/kernel/mm.py
- test/inductor/test_max_autotune.py
@ -158,7 +160,21 @@
- test/test_matmul_cuda.py
- test/test_scaled_matmul_cuda.py
- test/inductor/test_fp8.py
- aten/src/ATen/native/cuda/Blas.cpp
- aten/src/ATen/native/cuda/*Blas.cpp
- aten/src/ATen/cuda/CUDA*Blas.*
- torch/_inductor/kernel/mm.py
- test/inductor/test_max_autotune.py
- third_party/fbgemm
"ciflow/mps":
- aten/src/ATen/mps/**
- aten/src/ATen/native/mps/**
- torch/_inductor/codegen/mps.py
- test/test_mps.py
- test/inductor/test_mps_basic.py
"ciflow/h100-symm-mem":
- torch/csrc/distributed/c10d/symm_mem/**
- torch/distributed/_symmetric_memory/**
- test/distributed/**/*mem*
- test/distributed/**/*mem*/**

View File

@ -10,3 +10,4 @@
pathFilter:
- 'torch/csrc/inductor/aoti_torch/c/*'
- 'torch/csrc/inductor/aoti_torch/generated/*'
- 'torch/csrc/stable/c/*'

View File

@ -2,8 +2,8 @@ tracking_issue: 24422
ciflow_tracking_issue: 64124
ciflow_push_tags:
- ciflow/b200
- ciflow/b200-symm-mem
- ciflow/b200-distributed
- ciflow/b200-symm-mem
- ciflow/binaries
- ciflow/binaries_libtorch
- ciflow/binaries_wheel
@ -22,6 +22,8 @@ ciflow_push_tags:
- ciflow/inductor-perf-test-nightly-xpu
- ciflow/inductor-periodic
- ciflow/inductor-rocm
- ciflow/inductor-rocm-mi200
- ciflow/inductor-rocm-mi300
- ciflow/linux-aarch64
- ciflow/mps
- ciflow/nightly
@ -33,11 +35,13 @@ ciflow_push_tags:
- ciflow/quantization-periodic
- ciflow/riscv64
- ciflow/rocm
- ciflow/rocm-mi200
- ciflow/rocm-mi300
- ciflow/rocm-mi355
- ciflow/rocm-navi31
- ciflow/s390
- ciflow/slow
- ciflow/slow-rocm-mi200
- ciflow/torchbench
- ciflow/triton_binaries
- ciflow/trunk

View File

@ -1,10 +1,11 @@
# Delete old branches
import os
import re
from collections.abc import Callable
from datetime import datetime
from functools import lru_cache
from pathlib import Path
from typing import Any, Callable
from typing import Any
from github_utils import gh_fetch_json_dict, gh_graphql
from gitutils import GitRepo

View File

@ -8,10 +8,11 @@ import re
import subprocess
import sys
import warnings
from collections.abc import Callable
from enum import Enum
from functools import cache
from logging import info
from typing import Any, Callable, Optional
from typing import Any, Optional
from urllib.request import Request, urlopen
import yaml

View File

@ -11,7 +11,8 @@ import sys
import time
import urllib
import urllib.parse
from typing import Any, Callable, Optional
from collections.abc import Callable
from typing import Any, Optional
from urllib.request import Request, urlopen

View File

@ -3,8 +3,9 @@
import json
import os
import warnings
from collections.abc import Callable
from dataclasses import dataclass
from typing import Any, Callable, cast, Optional, Union
from typing import Any, cast, Optional, Union
from urllib.error import HTTPError
from urllib.parse import quote
from urllib.request import Request, urlopen

View File

@ -4,10 +4,10 @@ import os
import re
import tempfile
from collections import defaultdict
from collections.abc import Iterator
from collections.abc import Callable, Iterator
from datetime import datetime
from functools import wraps
from typing import Any, Callable, cast, Optional, TypeVar, Union
from typing import Any, cast, Optional, TypeVar, Union
T = TypeVar("T")

View File

@ -17,12 +17,12 @@ import re
import time
import urllib.parse
from collections import defaultdict
from collections.abc import Iterable
from collections.abc import Callable, Iterable
from dataclasses import dataclass
from functools import cache
from pathlib import Path
from re import Pattern
from typing import Any, Callable, cast, NamedTuple, Optional
from typing import Any, cast, NamedTuple, Optional
from warnings import warn
import yaml

View File

@ -37,7 +37,6 @@ jobs:
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runner: linux.12xlarge.memory
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-distributed-b200
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
cuda-arch-list: '10.0'

View File

@ -37,7 +37,6 @@ jobs:
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runner: linux.12xlarge.memory
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100-symm
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
cuda-arch-list: '10.0'

View File

@ -56,6 +56,8 @@ jobs:
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9,
pytorch-linux-jammy-cuda12.4-cudnn9-py3-gcc11,
pytorch-linux-jammy-py3.10-clang12,
pytorch-linux-jammy-py3.11-clang12,
pytorch-linux-jammy-py3.12-clang12,
pytorch-linux-jammy-py3.13-clang12,
pytorch-linux-jammy-py3.14-clang12,
pytorch-linux-jammy-rocm-n-py3,
@ -65,9 +67,10 @@ jobs:
pytorch-linux-jammy-py3.10-gcc11,
pytorch-linux-jammy-py3-gcc11-inductor-benchmarks,
pytorch-linux-jammy-py3.12-halide,
pytorch-linux-jammy-cuda12.8-py3.12-pallas,
pytorch-linux-jammy-xpu-n-1-py3,
pytorch-linux-jammy-xpu-n-py3,
pytorch-linux-jammy-xpu-n-py3-inductor-benchmarks,
pytorch-linux-noble-xpu-n-py3,
pytorch-linux-noble-xpu-n-py3-inductor-benchmarks,
pytorch-linux-jammy-py3-clang18-asan,
pytorch-linux-jammy-py3-clang12-onnx,
pytorch-linux-jammy-linter,

View File

@ -37,7 +37,6 @@ jobs:
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runner: "linux.c7i.12xlarge"
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm90-dist
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
cuda-arch-list: '9.0'

View File

@ -83,8 +83,8 @@ jobs:
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-xpu-n-py3.10
docker-image-name: ci-image:pytorch-linux-jammy-xpu-n-py3-inductor-benchmarks
build-environment: linux-noble-xpu-n-py3.10
docker-image-name: ci-image:pytorch-linux-noble-xpu-n-py3-inductor-benchmarks
runner: linux.c7i.12xlarge
test-matrix: |
{ include: [
@ -117,7 +117,7 @@ jobs:
uses: ./.github/workflows/_xpu-test.yml
needs: xpu-n-py3_10-inductor-benchmark-build
with:
build-environment: linux-jammy-xpu-n-py3.10
build-environment: linux-noble-xpu-n-py3.10
dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-false-cppwrapper-true-aotinductor-true-freezing_cudagraphs-false-cudagraphs_low_precision-false
docker-image: ${{ needs.xpu-n-py3_10-inductor-benchmark-build.outputs.docker-image }}
test-matrix: ${{ needs.xpu-n-py3_10-inductor-benchmark-build.outputs.test-matrix }}
@ -137,7 +137,7 @@ jobs:
uses: ./.github/workflows/_xpu-test.yml
needs: xpu-n-py3_10-inductor-benchmark-build
with:
build-environment: linux-jammy-xpu-n-py3.10
build-environment: linux-noble-xpu-n-py3.10
dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cudagraphs-${{ inputs.cudagraphs }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}-maxautotune-${{ inputs.maxautotune }}-freezing_cudagraphs-${{ inputs.freezing_cudagraphs }}-cudagraphs_low_precision-${{ inputs.cudagraphs }}
docker-image: ${{ needs.xpu-n-py3_10-inductor-benchmark-build.outputs.docker-image }}
test-matrix: ${{ needs.xpu-n-py3_10-inductor-benchmark-build.outputs.test-matrix }}

View File

@ -7,7 +7,7 @@ on:
branches:
- release/*
tags:
- ciflow/inductor-rocm/*
- ciflow/inductor-rocm-mi200/*
workflow_dispatch:
concurrency:

View File

@ -7,6 +7,7 @@ on:
- release/*
tags:
- ciflow/inductor-rocm/*
- ciflow/inductor-rocm-mi300/*
workflow_dispatch:
concurrency:

View File

@ -81,6 +81,32 @@ jobs:
test-matrix: ${{ needs.inductor-halide-build.outputs.test-matrix }}
secrets: inherit
inductor-pallas-build:
name: inductor-pallas-build
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
build-environment: linux-jammy-cuda12.8-py3.12-gcc11
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-py3.12-pallas
cuda-arch-list: '8.9'
runner: linux.8xlarge.memory
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
test-matrix: |
{ include: [
{ config: "inductor-pallas", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.12xlarge.nvidia.gpu" },
]}
secrets: inherit
inductor-pallas-test:
name: inductor-pallas-test
uses: ./.github/workflows/_linux-test.yml
needs: inductor-pallas-build
with:
build-environment: linux-jammy-py3.12-gcc11
docker-image: ${{ needs.inductor-pallas-build.outputs.docker-image }}
test-matrix: ${{ needs.inductor-pallas-build.outputs.test-matrix }}
secrets: inherit
inductor-triton-cpu-build:
name: inductor-triton-cpu-build
uses: ./.github/workflows/_linux-build.yml

View File

@ -11,7 +11,6 @@ on:
- cron: 29 8 * * * # about 1:29am PDT, for mem leak check and rerun disabled tests
push:
tags:
- ciflow/periodic/*
- ciflow/periodic-rocm-mi200/*
branches:
- release/*

View File

@ -11,6 +11,7 @@ on:
- cron: 29 8 * * * # about 1:29am PDT, for mem leak check and rerun disabled tests
push:
tags:
- ciflow/periodic/*
- ciflow/periodic-rocm-mi300/*
branches:
- release/*

View File

@ -342,16 +342,16 @@ jobs:
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc9-inductor-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-xpu-n-py3_10-build:
name: linux-jammy-xpu-n-py3.10
linux-noble-xpu-n-py3_10-build:
name: linux-noble-xpu-n-py3.10
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
# This should sync with the build in xpu.yml but xpu uses a larger runner
# sync-tag: linux-xpu-n-build
runner_prefix: ${{ needs.get-label-type.outputs.label-type }}
build-environment: linux-jammy-xpu-n-py3.10
docker-image-name: ci-image:pytorch-linux-jammy-xpu-n-py3
build-environment: linux-noble-xpu-n-py3.10
docker-image-name: ci-image:pytorch-linux-noble-xpu-n-py3
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 4, runner: "linux.idc.xpu" },

View File

@ -5,7 +5,7 @@ on:
branches:
- release/*
tags:
- ciflow/rocm/*
- ciflow/rocm-mi200/*
workflow_dispatch:
schedule:
- cron: 29 8 * * * # about 1:29am PDT

View File

@ -6,6 +6,7 @@ on:
- main
- release/*
tags:
- ciflow/rocm/*
- ciflow/rocm-mi300/*
workflow_dispatch:
schedule:

81
.github/workflows/slow-rocm-mi200.yml vendored Normal file
View File

@ -0,0 +1,81 @@
# This workflow is dedicated to host slow jobs that are run only periodically because
# they are too slow to run in every commit. The list of slow tests can be found in
# https://github.com/pytorch/test-infra/blob/generated-stats/stats/slow-tests.json
name: slow-rocm-mi200
on:
push:
branches:
- release/*
tags:
- ciflow/slow/*
- ciflow/slow-rocm-mi200/*
schedule:
- cron: 0 */3 * * *
workflow_dispatch:
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}-${{ github.event.schedule }}
cancel-in-progress: true
permissions:
id-token: write
contents: read
jobs:
llm-td:
if: github.repository_owner == 'pytorch'
name: before-test
uses: ./.github/workflows/llm_td_retrieval.yml
permissions:
id-token: write
contents: read
target-determination:
name: before-test
uses: ./.github/workflows/target_determination.yml
needs: llm-td
permissions:
id-token: write
contents: read
get-label-type:
name: get-label-type
uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
linux-jammy-rocm-py3_10-build:
name: linux-jammy-rocm-py3.10
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-rocm-py3.10
docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
sync-tag: rocm-build
test-matrix: |
{ include: [
{ config: "slow", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.2", owners: ["module:rocm"] },
{ config: "slow", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.2", owners: ["module:rocm"] },
]}
secrets: inherit
linux-jammy-rocm-py3_10-test:
permissions:
id-token: write
contents: read
name: linux-jammy-rocm-py3.10
uses: ./.github/workflows/_rocm-test.yml
needs:
- linux-jammy-rocm-py3_10-build
- target-determination
with:
build-environment: linux-jammy-rocm-py3.10
docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }}
secrets: inherit

View File

@ -105,36 +105,6 @@ jobs:
test-matrix: ${{ needs.linux-jammy-py3_10-clang12-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-rocm-py3_10-build:
name: linux-jammy-rocm-py3.10
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-rocm-py3.10
docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
test-matrix: |
{ include: [
{ config: "slow", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.2", owners: ["module:rocm"] },
{ config: "slow", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.2", owners: ["module:rocm"] },
]}
secrets: inherit
linux-jammy-rocm-py3_10-test:
permissions:
id-token: write
contents: read
name: linux-jammy-rocm-py3.10
uses: ./.github/workflows/_rocm-test.yml
needs:
- linux-jammy-rocm-py3_10-build
- target-determination
with:
build-environment: linux-jammy-rocm-py3.10
docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-py3_10-clang18-asan-build:
name: linux-jammy-py3.10-clang18-asan
uses: ./.github/workflows/_linux-build.yml

View File

@ -52,7 +52,6 @@ jobs:
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runner: linux.12xlarge.memory
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
cuda-arch-list: '10.0'
@ -73,4 +72,4 @@ jobs:
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm100-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm100-build.outputs.test-matrix }}
aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
secrets: inherit
secrets: inherit

View File

@ -41,7 +41,6 @@ jobs:
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runner: linux.12xlarge.memory
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm90
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
cuda-arch-list: '9.0'

View File

@ -11,15 +11,16 @@ on:
- inductor
- unstable
- slow
- slow-rocm-mi200
- unstable-periodic
- inductor-periodic
- rocm
- rocm-mi200
- rocm-mi300
- rocm-mi355
- inductor-micro-benchmark
- inductor-micro-benchmark-x86
- inductor-cu124
- inductor-rocm
- inductor-rocm-mi200
- inductor-rocm-mi300
- mac-mps
- linux-aarch64

View File

@ -47,15 +47,15 @@ jobs:
]}
secrets: inherit
linux-jammy-xpu-n-py3_10-build:
name: linux-jammy-xpu-n-py3.10
linux-noble-xpu-n-py3_10-build:
name: linux-noble-xpu-n-py3.10
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
sync-tag: linux-xpu-n-build
runner_prefix: ${{ needs.get-label-type.outputs.label-type }}
build-environment: linux-jammy-xpu-n-py3.10
docker-image-name: ci-image:pytorch-linux-jammy-xpu-n-py3
build-environment: linux-noble-xpu-n-py3.10
docker-image-name: ci-image:pytorch-linux-noble-xpu-n-py3
runner: linux.c7i.12xlarge
test-matrix: |
{ include: [
@ -74,17 +74,17 @@ jobs:
]}
secrets: inherit
linux-jammy-xpu-n-py3_10-test:
name: linux-jammy-xpu-n-py3.10
linux-noble-xpu-n-py3_10-test:
name: linux-noble-xpu-n-py3.10
uses: ./.github/workflows/_xpu-test.yml
needs: linux-jammy-xpu-n-py3_10-build
needs: linux-noble-xpu-n-py3_10-build
permissions:
id-token: write
contents: read
with:
build-environment: linux-jammy-xpu-n-py3.10
docker-image: ${{ needs.linux-jammy-xpu-n-py3_10-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-xpu-n-py3_10-build.outputs.test-matrix }}
build-environment: linux-noble-xpu-n-py3.10
docker-image: ${{ needs.linux-noble-xpu-n-py3_10-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-noble-xpu-n-py3_10-build.outputs.test-matrix }}
secrets: inherit
windows-xpu-n-1-build:

View File

@ -143,7 +143,8 @@ init_command = [
'tools/linter/adapters/pip_init.py',
'--dry-run={{DRYRUN}}',
'numpy==1.26.4 ; python_version >= "3.10" and python_version <= "3.11"',
'numpy==2.1.0 ; python_version >= "3.12"',
'numpy==2.1.0 ; python_version >= "3.12" and python_version <= "3.13"',
'numpy==2.3.4 ; python_version >= "3.14"',
'expecttest==0.3.0',
'pyrefly==0.36.2',
'sympy==1.13.3',
@ -1401,7 +1402,7 @@ init_command = [
'--dry-run={{DRYRUN}}',
'usort==1.0.8.post1',
'isort==6.0.1',
'ruff==0.13.1', # sync with RUFF
'ruff==0.14.4', # sync with RUFF
]
is_formatter = true
@ -1536,7 +1537,7 @@ init_command = [
'python3',
'tools/linter/adapters/pip_init.py',
'--dry-run={{DRYRUN}}',
'ruff==0.13.1', # sync with PYFMT
'ruff==0.14.4', # sync with PYFMT
]
is_formatter = true

View File

@ -736,6 +736,44 @@ if(NOT DEFINED USE_BLAS)
set(USE_BLAS ON)
endif()
# Prioritized Text Linker Optimization
if(USE_PRIORITIZED_TEXT_FOR_LD)
set(LINKER_SCRIPT_FILE_IN "${CMAKE_SOURCE_DIR}/cmake/prioritized_text.txt")
set(LINKER_SCRIPT_FILE_OUT "${CMAKE_SOURCE_DIR}/cmake/linker_script.ld")
execute_process(
COMMAND ${Python_EXECUTABLE}
${CMAKE_SOURCE_DIR}/tools/setup_helpers/generate_linker_script.py
--filein "${LINKER_SCRIPT_FILE_IN}"
--fout "${LINKER_SCRIPT_FILE_OUT}"
RESULT_VARIABLE _gen_result
OUTPUT_VARIABLE _gen_output
ERROR_VARIABLE _gen_error
)
if(NOT _gen_result EQUAL 0)
message(FATAL_ERROR
"Failed to generate linker script:\n${_gen_output}\n${_gen_error}")
endif()
append_cxx_flag_if_supported("-ffunction-sections" CMAKE_CXX_FLAGS)
append_cxx_flag_if_supported("-fdata-sections" CMAKE_CXX_FLAGS)
append_c_flag_if_supported("-ffunction-sections" CMAKE_C_FLAGS)
append_c_flag_if_supported("-fdata-sections" CMAKE_C_FLAGS)
set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -T${LINKER_SCRIPT_FILE_OUT}")
set(CMAKE_MODULE_LINKER_FLAGS "${CMAKE_MODULE_LINKER_FLAGS} -T${LINKER_SCRIPT_FILE_OUT}")
else()
if(LINUX AND CPU_AARCH64)
message(WARNING [[
It is strongly recommended to enable linker script optimization for all AArch64 Linux builds.
To do so please export USE_PRIORITIZED_TEXT_FOR_LD=1
]])
endif()
endif()
# Build libtorch mobile library, which contains ATen/TH ops and native support
# for TorchScript model, but doesn't contain not-yet-unified caffe2 ops;
if(INTERN_BUILD_MOBILE)
@ -1402,9 +1440,6 @@ if(BUILD_JNI)
add_subdirectory(android/pytorch_android)
endif()
include(cmake/Summary.cmake)
caffe2_print_configuration_summary()
# Parse custom debug info
if(DEFINED USE_CUSTOM_DEBINFO)
string(REPLACE ";" " " SOURCE_FILES "${USE_CUSTOM_DEBINFO}")
@ -1444,56 +1479,5 @@ if(BUILD_BUNDLE_PTXAS AND USE_CUDA)
DESTINATION "${CMAKE_INSTALL_BINDIR}")
endif()
if(USE_PRIORITIZED_TEXT_FOR_LD)
add_compile_options(
$<$<COMPILE_LANGUAGE:C,CXX>:-ffunction-sections>
$<$<COMPILE_LANGUAGE:C,CXX>:-fdata-sections>
)
set(LINKER_SCRIPT_FILE_OUT "${CMAKE_SOURCE_DIR}/cmake/linker_script.ld")
set(LINKER_SCRIPT_FILE_IN "${CMAKE_SOURCE_DIR}/cmake/prioritized_text.txt")
add_custom_command(
OUTPUT "${LINKER_SCRIPT_FILE_OUT}"
COMMAND ${Python_EXECUTABLE} ${CMAKE_SOURCE_DIR}/tools/setup_helpers/generate_linker_script.py --filein "${LINKER_SCRIPT_FILE_IN}" --fout "${LINKER_SCRIPT_FILE_OUT}"
DEPENDS ${CMAKE_SOURCE_DIR}/tools/setup_helpers/generate_linker_script.py "${LINKER_SCRIPT_FILE_IN}"
COMMENT "Generating prioritized text linker files"
VERBATIM
)
add_custom_target(generate_linker_script DEPENDS "${LINKER_SCRIPT_FILE_OUT}")
if(BUILD_PYTHON)
set(LINKER_OPT_TARGETS torch_python)
endif()
if(NOT BUILD_LIBTORCHLESS)
list(APPEND LINKER_OPT_TARGETS torch_cpu c10)
if(USE_CUDA)
list(APPEND LINKER_OPT_TARGETS torch_cuda c10_cuda)
endif()
if(USE_XPU)
list(APPEND LINKER_OPT_TARGETS torch_xpu c10_xpu)
endif()
if(USE_ROCM)
list(APPEND LINKER_OPT_TARGETS torch_hip c10_hip)
endif()
endif()
foreach(tgt IN LISTS LINKER_OPT_TARGETS)
if(TARGET ${tgt})
add_dependencies("${tgt}" generate_linker_script)
target_link_options_if_supported(${tgt} "-T,${LINKER_SCRIPT_FILE_OUT}")
set_property(TARGET ${tgt} APPEND PROPERTY LINK_DEPENDS "${LINKER_SCRIPT_FILE_OUT}")
else()
message(WARNING "Requested target '${tgt}' for linker script optimization was not found.")
endif()
endforeach()
else()
if(LINUX AND CPU_AARCH64)
message(WARNING [[
It is strongly recommended to enable linker script optimization for all AArch64 Linux builds.
To do so please export USE_PRIORITIZED_TEXT_FOR_LD=1
]])
endif()
endif()
include(cmake/Summary.cmake)
caffe2_print_configuration_summary()

View File

@ -210,8 +210,12 @@ torch/backends/cudnn/ @eqy @syed-ahmed @Aidyn-A
/test/inductor/test_flex_attention.py @drisspg
/test/inductor/test_flex_decoding.py @drisspg
# Low Precision GEMMs
# Low Precision & Grouped GEMMs
/aten/src/ATen/native/cuda/Blas.cpp @drisspg @slayton58
/aten/src/ATen/native/cuda/GroupedBlas.cpp @drisspg @slayton58
/aten/src/ATen/native/cuda/ScaledBlas.cpp @drisspg @slayton58
/aten/src/ATen/cuda/CUDABlas.cpp @drisspg @slayton58
/aten/src/ATen/cuda/CUDABlas.h @drisspg @slayton58
/aten/src/ATen/cuda/CUDAScaledBlas.cpp @drisspg @slayton58
/aten/src/ATen/cuda/CUDAScaledBlas.h @drisspg @slayton58
/test/test_scaled_matmul_cuda.py @drisspg @slayton58

View File

@ -174,6 +174,12 @@ class TORCH_API Context {
static long versionCuDNN() {
return detail::getCUDAHooks().versionCuDNN();
}
static long versionRuntimeCuDNN() {
return detail::getCUDAHooks().versionRuntimeCuDNN();
}
static long versionCuDNNFrontend() {
return detail::getCUDAHooks().versionCuDNNFrontend();
}
static bool hasCuSOLVER() {
return detail::getCUDAHooks().hasCuSOLVER();
}

View File

@ -94,6 +94,11 @@ TORCH_API inline void resetPeakStats(c10::DeviceIndex device_index) {
at::getDeviceAllocator(device_type)->resetPeakStats(device_index);
}
TORCH_API inline std::pair<size_t, size_t> getMemoryInfo(
c10::DeviceIndex device_index) {
const auto device_type = getAccelerator(true).value();
return at::getDeviceAllocator(device_type)->getMemoryInfo(device_index);
}
} // namespace at::accelerator
namespace at {
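
Editor note: a hedged usage sketch of the new device-agnostic memory query added above (assumes an accelerator-enabled build; the include path is an assumption):

```cpp
#include <ATen/DeviceAccelerator.h>  // assumed header for the at::accelerator helpers

void report_device_memory() {
  // Returns {free_bytes, total_bytes} for device 0 of the current accelerator,
  // forwarded to the backend's DeviceAllocator::getMemoryInfo.
  auto [free_bytes, total_bytes] = at::accelerator::getMemoryInfo(/*device_index=*/0);
  (void)free_bytes;
  (void)total_bytes;
}
```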

View File

@ -6,6 +6,7 @@
#include <c10/util/Half.h>
#include <c10/util/Metaprogramming.h>
#include <c10/util/complex.h>
#include <torch/headeronly/core/Dispatch.h>
#ifdef __CUDACC__
#include <cuda.h> // For CUDA_VERSION
@ -61,12 +62,9 @@ TORCH_API void record_kernel_function_dtype(std::string name);
} \
} while (0)
#define AT_PRIVATE_CASE_TYPE_USING_HINT(enum_type, HINT, ...) \
case enum_type: { \
AT_PRIVATE_CHECK_SELECTIVE_BUILD(enum_type); \
using HINT [[maybe_unused]] = c10::impl::ScalarTypeToCPPTypeT<enum_type>; \
return __VA_ARGS__(); \
}
#define AT_PRIVATE_CASE_TYPE_USING_HINT(enum_type, HINT, ...) \
THO_PRIVATE_CASE_TYPE_USING_HINT_TMPL( \
AT_PRIVATE_CHECK_SELECTIVE_BUILD, enum_type, HINT, __VA_ARGS__)
#define AT_DISPATCH_CASE(enum_type, ...) \
AT_PRIVATE_CASE_TYPE_USING_HINT(enum_type, scalar_t, __VA_ARGS__)
@ -95,14 +93,6 @@ TORCH_API void record_kernel_function_dtype(std::string name);
return __VA_ARGS__(); \
}
namespace detail {
inline at::ScalarType scalar_type(at::ScalarType s) {
return s;
}
} // namespace detail
// The AT_DISPATCH_* family of macros provides the ability to
// conveniently generate specializations of a kernel over all of the
// dtypes we care about in PyTorch. We call it "dispatch" because
@ -190,27 +180,13 @@ inline at::ScalarType scalar_type(at::ScalarType s) {
// but we're just being safe (and it doesn't hurt.) Note we must
// use it to shut up warnings about unused store.
#define AT_DISPATCH_SWITCH(TYPE, NAME, ...) \
[&] { \
const auto& the_type = TYPE; \
constexpr const char* at_dispatch_name = NAME; \
/* don't use TYPE again in case it is an expensive or side-effect op */ \
at::ScalarType _st = ::detail::scalar_type(the_type); \
RECORD_KERNEL_FUNCTION_DTYPE(at_dispatch_name, _st); \
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wswitch-enum") \
switch (_st) { \
__VA_ARGS__ \
default: \
TORCH_CHECK_NOT_IMPLEMENTED( \
false, \
'"', \
at_dispatch_name, \
"\" not implemented for '", \
toString(_st), \
"'"); \
} \
C10_DIAGNOSTIC_POP() \
}()
#define AT_DISPATCH_SWITCH(TYPE, NAME, ...) \
THO_DISPATCH_SWITCH_TMPL( \
RECORD_KERNEL_FUNCTION_DTYPE, \
TORCH_CHECK_NOT_IMPLEMENTED, \
TYPE, \
NAME, \
__VA_ARGS__)
#define AT_DISPATCH_CASE_FLOATING_TYPES(...) \
AT_DISPATCH_CASE(at::ScalarType::Double, __VA_ARGS__) \
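
Editor note: for readers unfamiliar with the macro family being rerouted through the THO_* templates above, a minimal sketch of how AT_DISPATCH_SWITCH / AT_DISPATCH_CASE_* are consumed by a kernel (the fill_ones helper is hypothetical; the refactor does not change this usage):

```cpp
#include <ATen/Dispatch.h>
#include <ATen/core/Tensor.h>

// The macro expands the lambda once per dispatched dtype, binding `scalar_t`
// to the matching C++ type for each case.
void fill_ones(at::Tensor& t) {
  AT_DISPATCH_SWITCH(t.scalar_type(), "fill_ones",
      AT_DISPATCH_CASE_FLOATING_TYPES([&] {
        auto* data = t.data_ptr<scalar_t>();
        for (int64_t i = 0; i < t.numel(); ++i) {
          data[i] = scalar_t(1);
        }
      }));
}
```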

View File

@ -1,3 +1,8 @@
#pragma once
#include <torch/headeronly/core/Dispatch_v2.h>
// Get AT_DISPATCH_SWITCH and AT_DISPATCH_CASE:
#include <ATen/Dispatch.h>
// This is a new implementation of the AT_DISPATCH macro family from
@ -74,41 +79,19 @@
// macro expansion occurs, mediated with AT_EXPAND and AT_GUARD. I mostly
// relied on GPT4 to help me get it right.
// Public API macros
// See documentation above
#define AT_DISPATCH_V2(TYPE, NAME, BODY, ...) \
AT_DISPATCH_SWITCH(TYPE, NAME, AT_AP_VAR(AT_WRAP(BODY), TYPE, __VA_ARGS__))
// This macro lets you pass an arbitrary expression that may contain internal
// commas to another macro without having the commas causing the expression
// to be interpreted as being multiple arguments
#define AT_WRAP(...) __VA_ARGS__
#define AT_FLOAT8_TYPES \
c10::kFloat8_e5m2, c10::kFloat8_e5m2fnuz, c10::kFloat8_e4m3fn, \
c10::kFloat8_e4m3fnuz, c10::kFloat8_e8m0fnu
#define AT_INTEGRAL_TYPES \
c10::kByte, c10::kChar, c10::kInt, c10::kLong, c10::kShort
#define AT_FLOATING_TYPES c10::kDouble, c10::kFloat
#define AT_BAREBONES_UNSIGNED_TYPES c10::kUInt16, c10::kUInt32, c10::kUInt64
#define AT_INTEGRAL_TYPES_V2 \
AT_EXPAND(AT_INTEGRAL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES)
#define AT_COMPLEX_TYPES c10::kComplexDouble, c10::kComplexFloat
#define AT_QINT_TYPES c10::kQInt8, c10::kQUInt8, c10::kQInt32
// NB: not *actually* all types
#define AT_ALL_TYPES AT_EXPAND(AT_INTEGRAL_TYPES), AT_EXPAND(AT_FLOATING_TYPES)
#define AT_ALL_TYPES_AND_COMPLEX \
AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_COMPLEX_TYPES)
// Helper macros
THO_DISPATCH_V2_TMPL( \
AT_DISPATCH_SWITCH, \
AT_DISPATCH_CASE, \
TYPE, \
NAME, \
AT_WRAP(BODY), \
__VA_ARGS__)
// Unused helper macros, kept for BC:
#define AT_AP_VAR(N, T, ...) \
AT_EXPAND(AT_CONCAT(AT_AP, AT_NUM_ARGS(__VA_ARGS__))(AT_WRAP(N), __VA_ARGS__))
#define AT_CONCAT(a, b) AT_CONCAT_AUX(a, b)
#define AT_CONCAT_AUX(a, b) a##b
#define AT_EXPAND(X) X
// Ensure we never have too many scalar types for the expansion here to
// support. To bump this, you must regenerate the macros below.
@ -119,12 +102,6 @@ static_assert(static_cast<int>(c10::ScalarType::NumOptions) < 60);
num_args = 60
nums = ', '.join(str(i) for i in reversed(range(num_args+1)))
args = ', '.join(f'_{i}' for i in range(1, num_args+1))
print(f'#define AT_NUM_ARGS(...) AT_EXPAND(AT_NUM_ARGS_AUX(__VA_ARGS__, {nums}))')
print(f'#define AT_NUM_ARGS_AUX({args}, N, ...) N')
for i in range(1, num_args+1):
args = ', '.join(f'_{i}' for i in range(1, i+1))
cases = ' '.join([f'AT_DISPATCH_CASE(_{j}, N)' for j in range(1, i+1)])
@ -135,8 +112,6 @@ for i in range(1, num_args+1):
// Begin generated code
// clang-format off
#define AT_NUM_ARGS(...) AT_EXPAND(AT_NUM_ARGS_AUX(__VA_ARGS__, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0))
#define AT_NUM_ARGS_AUX(_1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34, _35, _36, _37, _38, _39, _40, _41, _42, _43, _44, _45, _46, _47, _48, _49, _50, _51, _52, _53, _54, _55, _56, _57, _58, _59, _60, N, ...) N
#define AT_AP1(N, _1) AT_DISPATCH_CASE(_1, N)
#define AT_AP2(N, _1, _2) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N)
#define AT_AP3(N, _1, _2, _3) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N)
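
Editor note: a hedged sketch of the AT_DISPATCH_V2 entry point documented above (the zero_out helper is hypothetical); the call pattern, also visible in the CPU/CUDA reduction files later in this diff, is unchanged by moving the machinery into torch/headeronly:

```cpp
#include <ATen/Dispatch_v2.h>
#include <ATen/core/Tensor.h>

// AT_WRAP protects the lambda's internal commas; AT_EXPAND splices a type
// list; extra scalar types can be appended individually.
void zero_out(at::Tensor& t) {
  AT_DISPATCH_V2(t.scalar_type(), "zero_out", AT_WRAP([&] {
        auto* data = t.data_ptr<scalar_t>();
        for (int64_t i = 0; i < t.numel(); ++i) {
          data[i] = scalar_t(0);
        }
      }),
      AT_EXPAND(AT_ALL_TYPES), at::kHalf, at::kBFloat16);
}
```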

View File

@ -226,8 +226,8 @@ template <
typename B = HostBlock<S>>
struct CachingHostAllocatorImpl {
virtual ~CachingHostAllocatorImpl() {
active_ = false;
if (pinned_use_background_threads()) {
if (active_) {
active_ = false;
getBackgroundThreadPool()->waitWorkComplete();
}
}
@ -260,6 +260,7 @@ struct CachingHostAllocatorImpl {
if (pinned_use_background_threads()) {
// Launch the background thread and process events in a loop.
static bool background_thread_flag [[maybe_unused]] = [this] {
active_ = true;
getBackgroundThreadPool()->run([&]() {
while (active_) {
process_events();
@ -683,9 +684,9 @@ struct CachingHostAllocatorImpl {
alignas(hardware_destructive_interference_size) std::mutex events_mutex_;
std::deque<std::pair<E, B*>> events_; // event queue paired with block
// Indicates whether the object is active.
// Indicates whether the event-processing thread pool is active.
// Set to false in the destructor to signal background threads to stop.
std::atomic<bool> active_{true};
std::atomic<bool> active_{false};
protected:
alignas(hardware_destructive_interference_size) HostStatsStaged stats_;
};
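
Editor note: the hunk above changes `active_` from "object is alive" to "background thread pool was actually started", so the destructor only waits when a worker exists. A minimal sketch of that lifecycle pattern (names are illustrative, not the allocator's API):

```cpp
#include <atomic>
#include <thread>

// The flag starts false and becomes true only when a background worker is
// really launched, so teardown never blocks on a pool that was never created.
struct BackgroundProcessor {
  std::atomic<bool> active{false};
  std::thread worker;

  void start() {
    active = true;
    worker = std::thread([this] {
      while (active.load()) {
        // process_events();
      }
    });
  }

  ~BackgroundProcessor() {
    if (active.load()) {
      active = false;  // signal the loop to exit
      worker.join();   // analogous to waitWorkComplete() above
    }
  }
};
```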

View File

@ -245,6 +245,9 @@ class TORCH_API TensorBase {
size_t weak_use_count() const noexcept {
return impl_.weak_use_count();
}
bool is_uniquely_owned() const noexcept {
return impl_.is_uniquely_owned();
}
std::string toString() const;
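
Editor note: a hedged sketch of the kind of check the new accessor enables (semantics inferred from the name; the helper below is hypothetical):

```cpp
#include <ATen/core/Tensor.h>

// Hypothetical: only take a fast path when no other C++ owner can observe `t`.
bool can_mutate_in_place(const at::Tensor& t) {
  return t.defined() && t.is_uniquely_owned();
}
```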

View File

@ -55,14 +55,6 @@ struct numeric_limits<int8_t> {
static inline __host__ __device__ int8_t upper_bound() { return INT8_MAX; }
};
template <>
struct numeric_limits<uint16_t> {
static inline __host__ __device__ uint16_t lowest() { return 0; }
static inline __host__ __device__ uint16_t max() { return UINT16_MAX; }
static inline __host__ __device__ uint16_t lower_bound() { return 0; }
static inline __host__ __device__ uint16_t upper_bound() { return UINT16_MAX; }
};
template <>
struct numeric_limits<int16_t> {
static inline __host__ __device__ int16_t lowest() { return INT16_MIN; }
@ -71,14 +63,6 @@ struct numeric_limits<int16_t> {
static inline __host__ __device__ int16_t upper_bound() { return INT16_MAX; }
};
template <>
struct numeric_limits<uint32_t> {
static inline __host__ __device__ uint32_t lowest() { return 0; }
static inline __host__ __device__ uint32_t max() { return UINT32_MAX; }
static inline __host__ __device__ uint32_t lower_bound() { return 0; }
static inline __host__ __device__ uint32_t upper_bound() { return UINT32_MAX; }
};
template <>
struct numeric_limits<int32_t> {
static inline __host__ __device__ int32_t lowest() { return INT32_MIN; }
@ -87,21 +71,6 @@ struct numeric_limits<int32_t> {
static inline __host__ __device__ int32_t upper_bound() { return INT32_MAX; }
};
template <>
struct numeric_limits<uint64_t> {
#ifdef _MSC_VER
static inline __host__ __device__ uint64_t lowest() { return 0; }
static inline __host__ __device__ uint64_t max() { return _UI64_MAX; }
static inline __host__ __device__ uint64_t lower_bound() { return 0; }
static inline __host__ __device__ uint64_t upper_bound() { return _UI64_MAX; }
#else
static inline __host__ __device__ uint64_t lowest() { return 0; }
static inline __host__ __device__ uint64_t max() { return UINT64_MAX; }
static inline __host__ __device__ uint64_t lower_bound() { return 0; }
static inline __host__ __device__ uint64_t upper_bound() { return UINT64_MAX; }
#endif
};
template <>
struct numeric_limits<int64_t> {
#ifdef _MSC_VER

View File

@ -21,6 +21,7 @@
#if AT_CUDNN_ENABLED()
#include <ATen/cudnn/cudnn-wrapper.h>
#include <cudnn_frontend.h>
#endif
#if AT_MAGMA_ENABLED()
@ -351,6 +352,26 @@ long CUDAHooks::versionCuDNN() const {
#endif
}
long CUDAHooks::versionRuntimeCuDNN() const {
#if AT_CUDNN_ENABLED()
#ifndef USE_STATIC_CUDNN
return cudnnGetVersion();
#else
return CUDNN_VERSION;
#endif
#else
TORCH_CHECK(false, "Cannot query CuDNN version if ATen_cuda is not built with CuDNN");
#endif
}
long CUDAHooks::versionCuDNNFrontend() const {
#if AT_CUDNN_ENABLED()
return CUDNN_FRONTEND_VERSION;
#else
TORCH_CHECK(false, "Cannot query CuDNN Frontend version if ATen_cuda is not built with CuDNN");
#endif
}
long CUDAHooks::versionMIOpen() const {
#if AT_ROCM_ENABLED()
return MIOPEN_VERSION_MAJOR * 10000 +

View File

@ -49,6 +49,8 @@ struct CUDAHooks : public at::CUDAHooksInterface {
bool hasCUDART() const override;
long versionCUDART() const override;
long versionCuDNN() const override;
long versionRuntimeCuDNN() const override;
long versionCuDNNFrontend() const override;
long versionMIOpen() const override;
std::string showConfig() const override;
double batchnormMinEpsilonCuDNN() const override;

View File

@ -174,6 +174,14 @@ struct TORCH_API CUDAHooksInterface : AcceleratorHooksInterface {
TORCH_CHECK(false, "Cannot query cuDNN version without ATen_cuda library. ", CUDA_HELP);
}
virtual long versionRuntimeCuDNN() const {
TORCH_CHECK(false, "Cannot query cuDNN version without ATen_cuda library. ", CUDA_HELP);
}
virtual long versionCuDNNFrontend() const {
TORCH_CHECK(false, "Cannot query cuDNN Frontend version without ATen_cuda library. ", CUDA_HELP);
}
virtual long versionMIOpen() const {
TORCH_CHECK(false, "Cannot query MIOpen version without ATen_cuda library. ", CUDA_HELP);
}
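
Editor note: the new hooks separate the cuDNN version PyTorch was compiled against from the one actually loaded, which matters for the version-gated convolution and SDPA checks later in this diff. A hedged sketch of the underlying distinction (assumes a cuDNN build):

```cpp
#include <cudnn.h>  // provides CUDNN_VERSION and cudnnGetVersion(); assumed available

void log_cudnn_versions() {
  size_t compiled = CUDNN_VERSION;     // headers used at build time
  size_t runtime = cudnnGetVersion();  // libcudnn loaded at runtime
  // When cuDNN is linked dynamically these can differ, so workaround gates
  // should consult the runtime value (what versionRuntimeCuDNN now exposes).
  (void)compiled;
  (void)runtime;
}
```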

View File

@ -157,6 +157,8 @@ constexpr DispatchKeySet kKeysToPropagateToWrapper({
DispatchKey::Negative,
DispatchKey::Conjugate,
DispatchKey::XLA,
DispatchKey::XPU,
DispatchKey::HPU,
DispatchKey::CUDA,
DispatchKey::CPU,
DispatchKey::PrivateUse1,

View File

@ -409,7 +409,7 @@ struct ConvParams {
if (!detail::getCUDAHooks().compiledWithCuDNN() || !input.is_cuda() || !cudnn_enabled) {
return false;
}
static long cudnn_version = detail::getCUDAHooks().versionCuDNN();
static long cudnn_version = detail::getCUDAHooks().versionRuntimeCuDNN();
// broken on cuDNN 9.8 - 9.14
if (cudnn_version >= 90800 && cudnn_version < 91500) {
if (cudnn_conv_suggest_memory_format(input, weight) == at::MemoryFormat::Contiguous &&
@ -453,7 +453,7 @@ struct ConvParams {
}
// native kernel doesn't support 64-bit non-splittable case
if (!(canUse32BitIndexMath(input) && canUse32BitIndexMath(weight))) {
static long cudnn_version = detail::getCUDAHooks().compiledWithCuDNN() ? detail::getCUDAHooks().versionCuDNN() : -1;
static long cudnn_version = detail::getCUDAHooks().compiledWithCuDNN() ? detail::getCUDAHooks().versionRuntimeCuDNN() : -1;
// TODO(eqy): remove this once cuDNN fixes 64-bit depthwise support, first broken in 9.11x
if (cudnn_conv_suggest_memory_format(input, weight) != at::MemoryFormat::Contiguous) {
if (cudnn_version < 0 || cudnn_version > 91000) {

View File

@ -5,7 +5,6 @@
#include <ATen/native/ReduceOpsUtils.h>
#include <ATen/Dispatch.h>
#include <ATen/Dispatch_v2.h>
#include <ATen/Parallel.h>
#include <ATen/TensorIterator.h>
#include <ATen/OpMathType.h>
@ -79,12 +78,12 @@ void min_all_kernel_impl(Tensor& result, const Tensor& input) {
reduce_all_impl<int64_t>(result, input, upper_bound<int64_t>(),
[=](int64_t a, int64_t b) -> int64_t { return min_impl(a, b); });
} else {
AT_DISPATCH_V2(input.scalar_type(), "min_all", AT_WRAP([&] {
AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBFloat16, input.scalar_type(), "min_all", [&] {
using Vec = Vectorized<opmath_type<scalar_t>>;
reduce_all_impl_vec<scalar_t>(result, input, upper_bound<scalar_t>(),
[=] (scalar_t a , scalar_t b) -> scalar_t { return min_impl(a, b); },
[=](Vec a, Vec b) -> Vec { return minimum(a, b); });
}), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), kHalf, kBFloat16);
});
}
}
@ -104,12 +103,12 @@ void max_all_kernel_impl(Tensor& result, const Tensor& input) {
reduce_all_impl<int64_t>(result, input, lower_bound<int64_t>(),
[=](int64_t a, int64_t b) -> int64_t { return max_impl(a, b); });
} else {
AT_DISPATCH_V2(input.scalar_type(), "max_all", AT_WRAP([&] {
AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBFloat16, input.scalar_type(), "max_all", [&] {
using Vec = Vectorized<opmath_type<scalar_t>>;
reduce_all_impl_vec<scalar_t>(result, input, lower_bound<scalar_t>(),
[=] (scalar_t a , scalar_t b) -> scalar_t { return max_impl(a, b); },
[=](Vec a, Vec b) -> Vec { return maximum(a, b); });
}), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), kHalf, kBFloat16);
});
}
}
@ -200,7 +199,7 @@ void aminmax_allreduce_kernel(
}
);
} else {
AT_DISPATCH_V2(input.scalar_type(), "aminmax_cpu", AT_WRAP([&] {
AT_DISPATCH_ALL_TYPES_AND2(kBFloat16, kHalf, input.scalar_type(), "aminmax_cpu", [&] {
using Vec = Vectorized<opmath_type<scalar_t>>;
using scalar_t_pair = std::pair<scalar_t, scalar_t>;
reduce_all_impl_vec_two_outputs<scalar_t>(
@ -215,7 +214,7 @@ void aminmax_allreduce_kernel(
[=](Vec a, Vec b) -> Vec { return minimum(a, b); },
[=](Vec a, Vec b) -> Vec { return maximum(a, b); }
);
}), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), kBFloat16, kHalf);
});
}
}

View File

@ -3,7 +3,6 @@
#include <ATen/core/Tensor.h>
#include <ATen/Dispatch.h>
#include <ATen/Dispatch_v2.h>
#include <ATen/OpMathType.h>
#include <ATen/cpu/vec/vec.h>
#include <ATen/cpu/vec/functional.h>
@ -348,35 +347,34 @@ struct MinValuesOps: public at::native::MinOps<scalar_t> {
};
void min_values_kernel_impl(TensorIterator& iter) {
// This case is special because Vectorized<int64_t> does not
// handle upper_bound<int64_t>().
// See: https://github.com/pytorch/pytorch/issues/43254
if (iter.dtype() == kLong || iter.dtype() == kUInt64) {
AT_DISPATCH_V2(iter.dtype(), "min_values_cpu", AT_WRAP([&iter] {
binary_kernel_reduce(
iter,
MinValuesOps<scalar_t>{},
std::pair<scalar_t, int64_t>(upper_bound<scalar_t>(), -1));
}), kLong, kUInt64);
if (iter.dtype() == kLong) {
// This case is special because Vectorized<int64_t> does not
// handle upper_bound<int64_t>().
// See: https://github.com/pytorch/pytorch/issues/43254
using scalar_t = int64_t;
binary_kernel_reduce(
iter,
MinValuesOps<scalar_t>{},
std::pair<scalar_t, int64_t>(upper_bound<scalar_t>(), -1));
return;
}
AT_DISPATCH_V2(iter.dtype(), "min_values_cpu", AT_WRAP([&iter] {
AT_DISPATCH_ALL_TYPES_AND3(kBFloat16, kHalf, kBool, iter.dtype(), "min_values_cpu", [&iter] {
binary_kernel_reduce_vec(
iter,
[](scalar_t a, scalar_t b) -> scalar_t { return min_impl(a, b); },
[](Vectorized<scalar_t> a, Vectorized<scalar_t> b) { return minimum(a, b); },
static_cast<double>(upper_bound<scalar_t>()));
}), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), kBFloat16, kHalf, kBool);
});
}
void max_values_kernel_impl(TensorIterator& iter) {
AT_DISPATCH_V2(iter.dtype(), "max_values_cpu", AT_WRAP([&iter] {
AT_DISPATCH_ALL_TYPES_AND3(kBFloat16, kHalf, kBool, iter.dtype(), "max_values_cpu", [&iter] {
binary_kernel_reduce_vec(
iter,
[](scalar_t a, scalar_t b) -> scalar_t { return max_impl(a, b); },
[](Vectorized<scalar_t> a, Vectorized<scalar_t> b) { return maximum(a, b); },
lower_bound<scalar_t>());
}), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), kBFloat16, kHalf, kBool);
});
}
void argmax_kernel_impl(TensorIterator &iter) {

View File

@ -11,7 +11,6 @@
#include <vector>
#include <ATen/Dispatch.h>
#include <ATen/Dispatch_v2.h>
#include <ATen/Parallel.h>
#include <ATen/NumericUtils.h>
#include <ATen/TensorIterator.h>
@ -107,7 +106,7 @@ void min_kernel_impl(
bool keepdim) {
int64_t self_dim_size = ensure_nonempty_size(self, dim);
AT_DISPATCH_V2(self.scalar_type(), "min_cpu", AT_WRAP([&] {
AT_DISPATCH_ALL_TYPES_AND3(ScalarType::Half, ScalarType::BFloat16, ScalarType::Bool, self.scalar_type(), "min_cpu", [&] {
compare_base_kernel<scalar_t>(result, indice, self, dim, keepdim, [&] (
scalar_t* result_data, int64_t* indice_data,
const scalar_t* self_data, auto self_dim_stride) {
@ -129,7 +128,7 @@ void min_kernel_impl(
*indice_data = index;
}
);
}), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), ScalarType::Half, ScalarType::BFloat16, ScalarType::Bool);
});
}
void max_kernel_impl(
@ -140,7 +139,7 @@ void max_kernel_impl(
bool keepdim) {
int64_t self_dim_size = ensure_nonempty_size(self, dim);
AT_DISPATCH_V2(self.scalar_type(), "max_cpu", AT_WRAP([&] {
AT_DISPATCH_ALL_TYPES_AND3(ScalarType::Half, ScalarType::BFloat16, ScalarType::Bool, self.scalar_type(), "max_cpu", [&] {
compare_base_kernel<scalar_t>(result, indice, self, dim, keepdim, [&] (
scalar_t* result_data, int64_t* indice_data,
const scalar_t* self_data, auto self_dim_stride) {
@ -162,7 +161,7 @@ void max_kernel_impl(
*indice_data = index;
}
);
}), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), ScalarType::Half, ScalarType::BFloat16, ScalarType::Bool);
});
}
void aminmax_kernel(
@ -187,7 +186,7 @@ void aminmax_kernel(
return;
}
AT_DISPATCH_V2(self.scalar_type(), "aminmax_cpu", AT_WRAP([&] {
AT_DISPATCH_ALL_TYPES_AND3(ScalarType::Bool, ScalarType::BFloat16, ScalarType::Half, self.scalar_type(), "aminmax_cpu", [&] {
compare_base_kernel<scalar_t, scalar_t>(min_result, max_result, self, wrap_dim, keepdim, [&] (
scalar_t* min_result_data, scalar_t* max_result_data,
const scalar_t* self_data, auto self_dim_stride) {
@ -210,7 +209,7 @@ void aminmax_kernel(
*max_result_data = max_number;
}
);
}), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), ScalarType::Bool, ScalarType::BFloat16, ScalarType::Half);
});
}
void where_kernel_impl(TensorIterator &iter) {

View File

@ -884,6 +884,69 @@ struct type_specialized_kernel_launcher {
}
};
template <int arg_index>
struct type_specialized_broadcast_kernel_launcher {
template <
typename func_t,
typename array_t,
typename dtypes_t,
typename calc_t>
static void apply(
int64_t numel,
func_t f,
array_t data,
dtypes_t dtypes,
calc_t offset_calc) {
using traits = function_traits<func_t>;
using ret_t = typename traits::result_type;
using arg0_t = typename traits::template arg<0>::type;
using arg1_t = typename traits::template arg<1>::type;
if (dtypes[0] == rt_binary_specializations[arg_index][0] &&
dtypes[1] == rt_binary_specializations[arg_index][1] &&
dtypes[2] == rt_binary_specializations[arg_index][2]) {
using ret_cpp_t = c10::impl::ScalarTypeToCPPTypeT<rt_binary_specializations[arg_index][0]>;
using arg0_cpp_t = c10::impl::ScalarTypeToCPPTypeT<rt_binary_specializations[arg_index][1]>;
using arg1_cpp_t = c10::impl::ScalarTypeToCPPTypeT<rt_binary_specializations[arg_index][2]>;
constexpr int grp_sz = 128;
launch_legacy_kernel_manual_unroll<grp_sz, 4>(numel, [=] GPU_LAMBDA(int idx, bool unrl) {
if (unrl) {
auto offsets0 = offset_calc.get(idx);
auto offsets1 = offset_calc.get(idx + grp_sz);
auto offsets2 = offset_calc.get(idx + grp_sz * 2);
auto offsets3 = offset_calc.get(idx + grp_sz * 3);
void* out0 = data[0] + offsets0[0];
void* out1 = data[0] + offsets1[0];
void* out2 = data[0] + offsets2[0];
void* out3 = data[0] + offsets3[0];
auto u = c10::load<arg0_cpp_t>(data[1] + offsets0[1]);
auto v = c10::load<arg1_cpp_t>(data[2] + offsets0[2]);
ret_t result0 = f(c10::convert<arg0_t>(u), c10::convert<arg1_t>(v));
auto u1 = c10::load<arg0_cpp_t>(data[1] + offsets1[1]);
auto v1 = c10::load<arg1_cpp_t>(data[2]+ offsets1[2]);
ret_t result1 = f(c10::convert<arg0_t>(u1), c10::convert<arg1_t>(v1));
auto u2 = c10::load<arg0_cpp_t>(data[1] + offsets2[1]);
auto v2 = c10::load<arg1_cpp_t>(data[2] + offsets2[2]);
ret_t result2 = f(c10::convert<arg0_t>(u2), c10::convert<arg1_t>(v2));
auto u3 = c10::load<arg0_cpp_t>(data[1] + offsets3[1]);
auto v3 = c10::load<arg1_cpp_t>(data[2] + offsets3[2]);
ret_t result3 = f(c10::convert<arg0_t>(u3), c10::convert<arg1_t>(v3));
*(ret_cpp_t*)out0 = c10::convert<ret_cpp_t>(result0);
*(ret_cpp_t*)out1 = c10::convert<ret_cpp_t>(result1);
*(ret_cpp_t*)out2 = c10::convert<ret_cpp_t>(result2);
*(ret_cpp_t*)out3 = c10::convert<ret_cpp_t>(result3);
} else {
auto offsets = offset_calc.get(idx);
void* out = data[0] + offsets[0];
auto u = c10::load<arg0_cpp_t>(data[1] + offsets[1]);
auto v = c10::load<arg1_cpp_t>(data[2] + offsets[2]);
ret_t result = f(c10::convert<arg0_t>(u), c10::convert<arg1_t>(v));
*(ret_cpp_t*)out = c10::convert<ret_cpp_t>(result);
}
});
}
}
};
} // namespace
#endif
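
Editor note: the launcher above is applied through a compile-time "static unroll" over the list of runtime dtype triples. A toy sketch of that idiom (plain C++, no TensorIterator machinery; names are illustrative):

```cpp
#include <cstdio>

// Apply Func<I>::apply(...) for every compile-time index I in [0, N); each
// specialization checks at runtime whether it is the one that should act.
template <template <int> class Func, int N, int I = 0>
void static_unroll(int runtime_tag) {
  if constexpr (I < N) {
    Func<I>::apply(runtime_tag);
    static_unroll<Func, N, I + 1>(runtime_tag);
  }
}

template <int I>
struct PrintIfMatch {
  static void apply(int runtime_tag) {
    if (runtime_tag == I) {
      std::printf("specialized branch %d selected\n", I);
    }
  }
};

// static_unroll<PrintIfMatch, 3>(2);  // only PrintIfMatch<2>::apply() prints
```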
@ -1002,6 +1065,32 @@ void gpu_kernel_impl(TensorIteratorBase& iter, const func_t& f) {
}
auto offset_calc = ::make_offset_calculator<traits::arity + 1>(iter);
#ifdef USE_ROCM
if (check_binary_rt_types_for_specialization(iter)) {
// constexpr to reduce the number of kernels generated for
// broadcast elementwise with mixed dtypes and limit which functors are actually
// applied to the load and store at compile time.
using func_tuple = typename traits::ArgsTuple;
if constexpr (
std::is_same_v<float, arg0_t> && traits::arity == 2 &&
check_binary_functor_types_for_specialization<
func_tuple,
float,
float,
traits::arity,
/*arg_num=*/0>::check()) {
memory::detail::static_unroll<
type_specialized_broadcast_kernel_launcher,
rt_binary_specializations.size()>::with_args(
numel,
f,
data,
dtypes,
offset_calc
);
return;
}
}
constexpr int grp_sz = 128;
launch_legacy_kernel_manual_unroll<grp_sz, 4>(numel, [=] GPU_LAMBDA(int idx, bool unrl) {
if (unrl) {

View File

@ -1,6 +1,5 @@
#define TORCH_ASSERT_NO_OPERATORS
#include <ATen/Dispatch.h>
#include <ATen/Dispatch_v2.h>
#include <ATen/NumericUtils.h>
#include <ATen/native/DispatchStub.h>
#include <ATen/native/ReduceAllOps.h>
@ -29,22 +28,22 @@ void _min_max_values_kernel_cuda_impl(TensorIterator& iter) {
}
void aminmax_allreduce_launch_kernel(TensorIterator& iter) {
AT_DISPATCH_V2(
iter.input_dtype(), "aminmax_all_cuda", AT_WRAP([&] {
AT_DISPATCH_ALL_TYPES_AND3(
kBFloat16, kHalf, kBool, iter.input_dtype(), "aminmax_all_cuda", [&] {
_min_max_values_kernel_cuda_impl<scalar_t>(iter);
}), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), kBFloat16, kHalf, kBool);
});
}
void aminmax_launch_kernel(TensorIterator& iter) {
AT_DISPATCH_V2(
iter.input_dtype(), "aminmax_cuda", AT_WRAP([&]() {
AT_DISPATCH_ALL_TYPES_AND3(
kBFloat16, kHalf, kBool, iter.input_dtype(), "aminmax_cuda", [&]() {
gpu_reduce_kernel<scalar_t, scalar_t>(
iter,
MinMaxOps<scalar_t, scalar_t, int32_t>{},
thrust::pair<scalar_t, scalar_t>(
at::numeric_limits<scalar_t>::upper_bound(),
at::numeric_limits<scalar_t>::lower_bound()));
}), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), kBFloat16, kHalf, kBool);
});
}
} // namespace at::native

View File

@ -1,6 +1,5 @@
#define TORCH_ASSERT_NO_OPERATORS
#include <ATen/Dispatch.h>
#include <ATen/Dispatch_v2.h>
#include <ATen/NumericUtils.h>
#include <ATen/native/DispatchStub.h>
#include <ATen/native/ReduceAllOps.h>
@ -34,27 +33,27 @@ void max_values_kernel_cuda_impl(TensorIterator& iter) {
}
void max_values_kernel_cuda(TensorIterator& iter) {
AT_DISPATCH_V2(
iter.dtype(), "max_values_cuda", AT_WRAP([&]() {
AT_DISPATCH_ALL_TYPES_AND3(
kBFloat16, kHalf, kBool, iter.dtype(), "max_values_cuda", [&]() {
max_values_kernel_cuda_impl<scalar_t>(iter);
}), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), kBFloat16, kHalf, kBool);
});
}
void max_launch_kernel(TensorIterator& iter) {
AT_DISPATCH_V2(
iter.input_dtype(), "max_cuda", AT_WRAP([&]() {
AT_DISPATCH_ALL_TYPES_AND3(
kBFloat16, kHalf, kBool, iter.input_dtype(), "max_cuda", [&]() {
gpu_reduce_kernel<scalar_t, scalar_t>(
iter,
MaxOps<scalar_t>{},
thrust::pair<scalar_t, int64_t>(
at::numeric_limits<scalar_t>::lower_bound(), 0));
}), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), kBFloat16, kHalf, kBool);
});
}
void max_all_launch_kernel(TensorIterator &iter) {
AT_DISPATCH_V2(iter.input_dtype(), "max_all_cuda", AT_WRAP([&] {
AT_DISPATCH_ALL_TYPES_AND3(kBFloat16, kHalf, kBool, iter.input_dtype(), "max_all_cuda", [&] {
max_values_kernel_cuda_impl<scalar_t>(iter);
}), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), kBFloat16, kHalf, kBool);
});
}
REGISTER_DISPATCH(max_values_stub, &max_values_kernel_cuda)

View File

@ -12,7 +12,6 @@
#include <ATen/NumericUtils.h>
#include <ATen/Dispatch.h>
#include <ATen/Dispatch_v2.h>
#include <ATen/NumericUtils.h>
#include <ATen/cuda/NumericLimits.cuh>
@ -34,24 +33,24 @@ void min_values_kernel_cuda_impl(TensorIterator& iter) {
}
void min_values_kernel_cuda(TensorIterator& iter) {
AT_DISPATCH_V2(iter.dtype(), "min_values_cuda", AT_WRAP([&]() {
AT_DISPATCH_ALL_TYPES_AND3(kBFloat16, kHalf, kBool, iter.dtype(), "min_values_cuda", [&]() {
min_values_kernel_cuda_impl<scalar_t>(iter);
}), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), kBFloat16, kHalf, kBool);
});
}
void min_launch_kernel(TensorIterator &iter) {
AT_DISPATCH_V2(iter.input_dtype(), "min_cuda", AT_WRAP([&]() {
AT_DISPATCH_ALL_TYPES_AND3(kBFloat16, kHalf, kBool, iter.input_dtype(), "min_cuda", [&]() {
gpu_reduce_kernel<scalar_t, scalar_t>(
iter,
MinOps<scalar_t>{},
thrust::pair<scalar_t, int64_t>(at::numeric_limits<scalar_t>::upper_bound(), 0));
}), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), kBFloat16, kHalf, kBool);
});
}
void min_all_launch_kernel(TensorIterator &iter) {
AT_DISPATCH_V2(iter.input_dtype(), "min_all_cuda", AT_WRAP([&] {
AT_DISPATCH_ALL_TYPES_AND3(kBFloat16, kHalf, kBool, iter.input_dtype(), "min_all_cuda", [&] {
min_values_kernel_cuda_impl<scalar_t>(iter);
}), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), kBFloat16, kHalf, kBool);
});
}
REGISTER_DISPATCH(min_values_stub, &min_values_kernel_cuda)

View File

@ -133,7 +133,7 @@ at::Tensor quantized_convolution(
// supported in conv.
mask_weight = weight_zero_points.numel() > 1 ? 1 : 0;
if (groups > 1 && weight_zero_points.numel() > 1)
mask_weight = (2 ^ 0) | (2 ^ 1); // 2^0 (group) | 2^1 (output channel)
mask_weight = (1 << 0) | (1 << 1); // 2^0 (group) | 2^1 (output channel)
dnnl::primitive_attr pattr;
bool src_need_zp = (act_zero_point != 0);

View File

@ -141,6 +141,9 @@ static Tensor& addmv_out_mps_impl(const Tensor& self,
};
MPSStream* stream = at::mps::getCurrentMPSStream();
if (result.numel() == 0) {
return result;
}
Tensor matMulVec = at::mm(mat, vec.unsqueeze(1)).squeeze(1);
@autoreleasepool {

View File

@ -2803,7 +2803,7 @@
- func: floor_divide.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck # TensorIterator
dispatch:
CPU, CUDA, MPS: floor_divide_out
CPU, CUDA, MPS, MTIA: floor_divide_out
SparseCPU, SparseCUDA, SparseMPS: floor_divide_out_sparse_zerodim
- func: floor_divide.Scalar(Tensor self, Scalar other) -> Tensor
@ -4292,6 +4292,7 @@
dispatch:
SparseCPU: sparse_sparse_matmul_cpu
SparseCUDA: sparse_sparse_matmul_cuda
SparseMPS: sparse_sparse_matmul_mps
autogen: _sparse_sparse_matmul.out
- func: mode(Tensor self, int dim=-1, bool keepdim=False) -> (Tensor values, Tensor indices)
@ -4383,7 +4384,7 @@
variants: function, method
dispatch:
CompositeExplicitAutograd: mv
SparseCPU, SparseCUDA: mv_sparse
SparseCPU, SparseCUDA, SparseMPS: mv_sparse
- func: mv.out(Tensor self, Tensor vec, *, Tensor(a!) out) -> Tensor(a!)
dispatch:
@ -9832,7 +9833,7 @@
structured_delegate: erfinv.out
variants: method, function
dispatch:
SparseCPU, SparseCUDA: erfinv_sparse
SparseCPU, SparseCUDA, SparseMPS: erfinv_sparse
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: erfinv_sparse_csr
tags: pointwise
@ -9841,7 +9842,7 @@
structured_delegate: erfinv.out
variants: method
dispatch:
SparseCPU, SparseCUDA: erfinv_sparse_
SparseCPU, SparseCUDA, SparseMPS: erfinv_sparse_
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: erfinv_sparse_csr_
tags: pointwise
@ -9851,7 +9852,7 @@
structured_inherits: TensorIteratorBase
dispatch:
CPU, CUDA, MPS: erfinv_out
SparseCPU, SparseCUDA: erfinv_sparse_out
SparseCPU, SparseCUDA, SparseMPS: erfinv_sparse_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: erfinv_sparse_csr_out
tags: pointwise

View File

@ -10,6 +10,10 @@
#include <ATen/NativeFunctions.h>
#else
#include <ATen/ops/_coalesce_native.h>
#include <ATen/ops/repeat_interleave_native.h>
#include <ATen/ops/cumsum.h>
#include <ATen/ops/_sparse_sparse_matmul_native.h>
#include <ATen/ops/_sparse_coo_tensor_unsafe.h>
#include <ATen/ops/_sparse_coo_tensor_unsafe_native.h>
#include <ATen/ops/cat.h>
#include <ATen/ops/add_native.h>
@ -888,5 +892,114 @@ static void sparse_mask_intersection_out_mps_kernel(
/*coalesce_mask=*/false);
}
Tensor sparse_sparse_matmul_mps(const Tensor& mat1_, const Tensor& mat2_) {
TORCH_CHECK(mat1_.is_sparse() && mat2_.is_sparse(),
"sparse_sparse_matmul_mps: both inputs must be sparse COO tensors");
TORCH_CHECK(mat1_.is_mps() && mat2_.is_mps(),
"sparse_sparse_matmul_mps: both inputs must be on MPS device");
TORCH_CHECK(mat1_.dim() == 2 && mat2_.dim() == 2,
"sparse_sparse_matmul_mps: both inputs must be 2D matrices");
TORCH_CHECK(mat1_.dense_dim() == 0 && mat2_.dense_dim() == 0,
"sparse_sparse_matmul_mps: only scalar values supported (dense_dim == 0)");
TORCH_CHECK(mat1_.size(1) == mat2_.size(0),
"mat1 and mat2 shapes cannot be multiplied (", mat1_.size(0), "x", mat1_.size(1), " and ", mat2_.size(0), "x", mat2_.size(1), ")");
TORCH_CHECK(mat1_.scalar_type() == mat2_.scalar_type(),
"sparse_sparse_matmul_mps: mat1 dtype ", mat1_.scalar_type(),
" does not match mat2 dtype ", mat2_.scalar_type());
const auto device = mat1_.device();
auto A = mat1_.coalesce();
auto B = mat2_.coalesce();
const auto I = A.size(0);
const auto K = A.size(1);
const auto N = B.size(1);
const auto nnzA = A._nnz();
const auto nnzB = B._nnz();
// Early empty result, return an empty, coalesced tensor
if (I == 0 || N == 0 || K == 0 || nnzA == 0 || nnzB == 0) {
auto empty_idx = at::empty({2, 0}, at::device(device).dtype(at::kLong));
auto empty_val = at::empty({0}, at::device(device).dtype(mat1_.scalar_type()));
auto out = _sparse_coo_tensor_unsafe(empty_idx, empty_val, {I, N}, mat1_.options());
out._coalesced_(true);
return out;
}
const auto computeDtype = at::result_type(mat1_, mat2_);
auto A_idx = A._indices().contiguous();
auto A_val = A._values().to(computeDtype).contiguous();
auto A_i = A_idx.select(0, 0).contiguous();
auto A_k = A_idx.select(0, 1).contiguous();
auto B_idx = B._indices().contiguous();
auto B_val = B._values().to(computeDtype).contiguous();
auto B_k = B_idx.select(0, 0).contiguous();
auto B_j = B_idx.select(0, 1).contiguous();
// csr-style row pointers for B by k (the shared dimension)
Tensor row_ptr_B;
{
auto batch_ptr = at::tensor({0LL, nnzB}, at::device(device).dtype(at::kLong));
row_ptr_B = at::empty({K + 1}, at::device(device).dtype(at::kLong));
build_row_ptr_per_batch_mps(B_k, batch_ptr, /*B=*/1, /*I=*/K, row_ptr_B);
}
auto row_ptr_B_lo = row_ptr_B.narrow(0, 0, K);
auto row_ptr_B_hi = row_ptr_B.narrow(0, 1, K);
auto deg_B = row_ptr_B_hi.sub(row_ptr_B_lo);
auto counts = deg_B.index_select(0, A_k);
const int64_t P = counts.sum().item<int64_t>();
if (P == 0) {
auto empty_idx = at::empty({2, 0}, at::device(device).dtype(at::kLong));
auto empty_val = at::empty({0}, at::device(device).dtype(mat1_.scalar_type()));
auto out = _sparse_coo_tensor_unsafe(empty_idx, empty_val, {I, N}, mat1_.options());
out._coalesced_(true);
return out;
}
auto group_ids = repeat_interleave_mps(counts);
// exclusive cumsum of counts
auto offsets = cumsum(counts, /*dim=*/0).sub(counts);
auto offsets_gather = offsets.index_select(0, group_ids);
auto within = at::arange(P, at::device(device).dtype(at::kLong)).sub(offsets_gather);
// Map each output element to its source B row and position
auto k_per_out = A_k.index_select(0, group_ids);
auto start_in_B = row_ptr_B.index_select(0, k_per_out);
auto seg_index = start_in_B.add(within);
// Assemble candidate coo pairs and values
auto i_out = A_i.index_select(0, group_ids).contiguous();
auto j_out = B_j.index_select(0, seg_index).contiguous();
auto vA_out = A_val.index_select(0, group_ids).contiguous();
auto vB_out = B_val.index_select(0, seg_index).contiguous();
auto v_out = vA_out.mul(vB_out);
// build (2, P) indices
auto out_indices = at::empty({2, P}, at::device(device).dtype(at::kLong)).contiguous();
out_indices.select(0, 0).copy_(i_out);
out_indices.select(0, 1).copy_(j_out);
auto result = _sparse_coo_tensor_unsafe(
out_indices, v_out, {I, N}, mat1_.options().dtype(computeDtype));
result = result.coalesce();
if (result.scalar_type() != mat1_.scalar_type()) {
auto cast_vals = result._values().to(mat1_.scalar_type());
auto out = _sparse_coo_tensor_unsafe(result._indices(), cast_vals, {I, N}, mat1_.options());
out._coalesced_(true);
return out;
}
return result;
}
REGISTER_MPS_DISPATCH(sparse_mask_intersection_out_stub, &sparse_mask_intersection_out_mps_kernel);
} // namespace at::native
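
Editor note: a hedged usage sketch of what the new kernel enables (assumes an MPS-enabled build; the op name comes from the native_functions.yaml entry above):

```cpp
#include <ATen/ATen.h>

// Multiply two sparse COO matrices living on the MPS device; before this
// change the SparseMPS key had no _sparse_sparse_matmul implementation.
at::Tensor sparse_square_mps(const at::Tensor& dense) {
  auto a = dense.to(at::kMPS).to_sparse();  // 2D COO tensor on MPS
  return at::_sparse_sparse_matmul(a, a);   // sparse @ sparse -> sparse
}
```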

View File

@ -478,7 +478,7 @@ bool check_cudnn_tensor_shapes(sdp_params const& params, bool debug) {
const auto s_k = params.key.sym_size(2);
const auto d_qk = params.query.sym_size(3);
const auto d_v = params.value.sym_size(3);
long cudnn_version = at::detail::getCUDAHooks().versionCuDNN();
long cudnn_version = at::detail::getCUDAHooks().versionRuntimeCuDNN();
if (cudnn_version < 8903) {
if (debug) {
TORCH_WARN("SDPA fprop requires cudnn 8.9.3 or higher");
@ -709,7 +709,7 @@ bool can_use_cudnn_attention(const sdp_params& params, bool debug) {
return false;
#endif
#if defined(CUDNN_VERSION)
static auto cudnn_version = cudnnGetVersion();
static auto cudnn_version = at::detail::getCUDAHooks().versionRuntimeCuDNN();
if (params.dropout > 0.0 && cudnn_version > 91100 && cudnn_version < 91400) {
if (debug) {
TORCH_WARN(CUDNN_VERSION, " cuDNN version does not support dropout in SDPA (9.11 - 9.13).");

View File

@ -10,6 +10,13 @@
...
}
{
ignore_empty_generic_uninitialised_conditional_jump
Memcheck:Cond
fun:_ZN2at6detail13empty_genericEN3c108ArrayRefIlEEPNS1_9AllocatorENS1_14DispatchKeySetENS1_10ScalarTypeESt8optionalINS1_12MemoryFormatEE
...
}
{
Cond_cuda
Memcheck:Cond

View File

@ -52,19 +52,18 @@ def test_sparse_coo_and_csr(m, n, k, nnz, test_count):
start.record()
coo.matmul(mat)
stop.record()
times.append(start.elapsed_time(stop))
coo_mean_time = sum(times) / len(times)
coo_mean_time = sum(times) / len(times)
times = []
for _ in range(test_count):
start.record()
csr.matmul(mat)
stop.record()
times.append(start.elapsed_time(stop))
times = []
for _ in range(test_count):
start.record()
csr.matmul(mat)
stop.record()
times.append(start.elapsed_time(stop))
csr_mean_time = sum(times) / len(times)
csr_mean_time = sum(times) / len(times)
return coo_mean_time, csr_mean_time

View File

@ -1,6 +1,8 @@
#pragma once
#include <c10/core/SafePyObject.h>
#include <c10/macros/Export.h>
#include <optional>
namespace c10 {
@ -15,7 +17,8 @@ struct C10_API AutogradState {
bool inference_mode,
bool fw_grad_mode,
bool multithreading_enabled)
: grad_mode_(grad_mode),
: graph_exec_group_(std::nullopt),
grad_mode_(grad_mode),
inference_mode_(inference_mode),
fw_grad_mode_(fw_grad_mode),
multithreading_enabled_(multithreading_enabled),
@ -41,6 +44,10 @@ struct C10_API AutogradState {
view_replay_enabled_ = view_replay_enabled;
}
void set_graph_exec_group(std::optional<SafePyObject> group) {
graph_exec_group_ = std::move(group);
}
bool get_grad_mode() const {
return grad_mode_;
}
@ -61,7 +68,12 @@ struct C10_API AutogradState {
return view_replay_enabled_;
}
const std::optional<SafePyObject>& get_graph_exec_group() const {
return graph_exec_group_;
}
private:
std::optional<SafePyObject> graph_exec_group_;
bool grad_mode_ : 1;
bool inference_mode_ : 1;
bool fw_grad_mode_ : 1;

View File

@ -96,6 +96,10 @@ struct C10_API DeviceAllocator : public c10::Allocator {
// Resets peak memory usage statistics for the specified device
virtual void resetPeakStats(c10::DeviceIndex device) = 0;
// Return the free memory size and total memory size in bytes for the
// specified device.
virtual std::pair<size_t, size_t> getMemoryInfo(c10::DeviceIndex device) = 0;
};
// This function is used to get the DeviceAllocator for a specific device type

View File

@ -44,7 +44,7 @@ struct C10_API SafePyObject {
(*other.pyinterpreter_)->incref(other.data_);
}
if (data_ != nullptr) {
(*pyinterpreter_)->decref(data_, /*has_pyobj_slot*/ false);
(*pyinterpreter_)->decref(data_);
}
data_ = other.data_;
pyinterpreter_ = other.pyinterpreter_;
@ -53,7 +53,7 @@ struct C10_API SafePyObject {
~SafePyObject() {
if (data_ != nullptr) {
(*pyinterpreter_)->decref(data_, /*has_pyobj_slot*/ false);
(*pyinterpreter_)->decref(data_);
}
}

View File

@ -48,6 +48,30 @@ void warnDeprecatedDataPtr() {
TORCH_CHECK(false, "Cannot access data pointer of Storage that is invalid.");
}
void StorageImpl::incref_pyobject() const {
// Because intrusive_ptr incref uses relaxed memory order, we need to
// do an acquire fence to ensure that the kHasPyObject bit was
// observed before the load of the PyObject* below.
// NB: This is a no-op on x86/x86-64
std::atomic_thread_fence(std::memory_order_acquire);
PyObject* obj = pyobj_slot_.load_pyobj();
(*pyobj_slot_.pyobj_interpreter())->incref(obj);
}
void StorageImpl::decref_pyobject() const {
PyObject* obj = pyobj_slot_.load_pyobj();
(*pyobj_slot_.pyobj_interpreter())->decref(obj);
}
bool StorageImpl::try_incref_pyobject() const {
c10::impl::PyInterpreter* interp = pyobj_slot_.pyobj_interpreter();
if (C10_UNLIKELY(!interp)) {
return false;
}
return (*interp)->try_incref(pyobj_slot_);
}
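
Editor note: the acquire-fence comment above relies on the standard release-store / acquire-fence pairing. A toy illustration of just that ordering rule (unrelated to the actual refcounting code):

```cpp
#include <atomic>

std::atomic<int*> payload{nullptr};
std::atomic<bool> flag{false};

void writer(int* p) {
  payload.store(p, std::memory_order_relaxed);
  flag.store(true, std::memory_order_release);  // publish
}

int* reader() {
  if (flag.load(std::memory_order_relaxed)) {   // cheap relaxed check
    std::atomic_thread_fence(std::memory_order_acquire);
    return payload.load(std::memory_order_relaxed);  // guaranteed to observe `p`
  }
  return nullptr;
}
```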
void SetStorageImplCreate(DeviceType t, StorageImplCreateHelper fptr) {
// Allowlist verification.
// Only if the devicetype is in the allowlist,

View File

@ -105,6 +105,12 @@ struct C10_API StorageImpl : public c10::intrusive_ptr_target {
data_ptr_.clear();
}
void incref_pyobject() const override final;
void decref_pyobject() const override final;
bool try_incref_pyobject() const override final;
size_t nbytes() const {
// OK to do this instead of maybe_as_int as nbytes is guaranteed positive
TORCH_CHECK(!size_bytes_is_heap_allocated_);
@ -370,4 +376,14 @@ C10_API c10::intrusive_ptr<c10::StorageImpl> make_storage_impl(
bool resizable,
std::optional<at::Device> device_opt);
namespace detail {
template <class T>
struct TargetTraits<
T,
std::enable_if_t<
std::is_base_of_v<c10::StorageImpl, std::remove_cv_t<T>>>> {
static constexpr bool can_have_pyobject = true;
};
} // namespace detail
} // namespace c10

View File

@ -277,7 +277,6 @@ void TensorImpl::release_resources() {
if (storage_) {
storage_ = {};
}
pyobj_slot_.maybe_destroy_pyobj();
}
#ifndef C10_DISABLE_TENSORIMPL_EXTENSIBILITY
@ -989,6 +988,30 @@ void TensorImpl::empty_tensor_restride_symint(MemoryFormat memory_format) {
}
}
void TensorImpl::incref_pyobject() const {
// Because intrusive_ptr incref uses relaxed memory order, we need to
// do an acquire fence to ensure that the kHasPyObject bit was
// observed before the load of the PyObject* below.
// NB: This is a no-op on x86/x86-64
std::atomic_thread_fence(std::memory_order_acquire);
PyObject* obj = pyobj_slot_.load_pyobj();
(*pyobj_slot_.pyobj_interpreter())->incref(obj);
}
void TensorImpl::decref_pyobject() const {
PyObject* obj = pyobj_slot_.load_pyobj();
(*pyobj_slot_.pyobj_interpreter())->decref(obj);
}
bool TensorImpl::try_incref_pyobject() const {
c10::impl::PyInterpreter* interp = pyobj_slot_.pyobj_interpreter();
if (C10_UNLIKELY(!interp)) {
return false;
}
return (*interp)->try_incref(pyobj_slot_);
}
namespace impl {
namespace {

View File

@ -2176,6 +2176,12 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target {
return &pyobj_slot_;
}
void incref_pyobject() const override final;
void decref_pyobject() const override final;
bool try_incref_pyobject() const override final;
private:
// See NOTE [std::optional operator usage in CUDA]
// We probably don't want to expose this publicly until
@ -3077,6 +3083,17 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target {
friend class C10_TensorImpl_Size_Check_Dummy_Class;
};
namespace detail {
template <class T>
struct TargetTraits<
T,
std::enable_if_t<std::is_base_of_v<c10::TensorImpl, std::remove_cv_t<T>>>> {
static constexpr bool can_have_pyobject = true;
};
} // namespace detail
// Note [TensorImpl size constraints]
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// Changed the size of TensorImpl? If the size went down, good for

View File

@ -11,8 +11,11 @@ struct NoopPyInterpreterVTable final : public PyInterpreterVTable {
void incref(PyObject* pyobj) const override {} // do nothing
void decref(PyObject* pyobj, bool has_pyobj_slot) const override {
} // do nothing
void decref(PyObject* pyobj) const override {} // do nothing
bool try_incref(const c10::impl::PyObjectSlot& pyobj_slot) const override {
return false;
}
#define PANIC(m) \
TORCH_INTERNAL_ASSERT( \
@ -20,6 +23,10 @@ struct NoopPyInterpreterVTable final : public PyInterpreterVTable {
"attempted to call " #m \
" on a Tensor with nontrivial PyObject after corresponding interpreter died")
size_t refcnt(PyObject* pyobj) const override {
PANIC(refcnt);
}
c10::intrusive_ptr<TensorImpl> detach(const TensorImpl* self) const override {
PANIC(detach);
}

View File

@ -18,6 +18,9 @@ namespace c10 {
struct IValue;
class OperatorHandle;
struct TensorImpl;
namespace impl {
struct PyObjectSlot;
} // namespace impl
} // namespace c10
namespace torch::jit {
@ -126,9 +129,12 @@ struct C10_API PyInterpreterVTable {
// Run Py_INCREF on a PyObject.
virtual void incref(PyObject* pyobj) const = 0;
// Run Py_DECREF on a PyObject. We DO NOT assume the GIL is held on call
// See NOTE [PyInterpreter::decref takes a `has_pyobj_slot` arg]
virtual void decref(PyObject* pyobj, bool has_pyobj_slot) const = 0;
// Run Py_DECREF on a PyObject. We DO NOT assume the GIL is held on call.
virtual void decref(PyObject* pyobj) const = 0;
// Run PyUnstable_TryIncRef on a PyObject if it's not NULL.
virtual bool try_incref(const c10::impl::PyObjectSlot& pyobj_slot) const = 0;
// Run Py_REFCNT on a PyObject.
virtual size_t refcnt(PyObject* pyobj) const = 0;
// Perform a detach by deferring to the __torch_dispatch__ implementation of
// detach, which will also arrange for the PyObject to get copied in this

View File

@ -1,56 +0,0 @@
#include <c10/core/impl/PyObjectSlot.h>
namespace c10::impl {
PyObjectSlot::PyObjectSlot() : pyobj_interpreter_(nullptr), pyobj_(nullptr) {}
PyObjectSlot::~PyObjectSlot() {
maybe_destroy_pyobj();
}
void PyObjectSlot::maybe_destroy_pyobj() {
if (owns_pyobj()) {
TORCH_INTERNAL_ASSERT(pyobj_interpreter_ != nullptr);
TORCH_INTERNAL_ASSERT(pyobj_ != nullptr);
(*pyobj_interpreter_.load(std::memory_order_acquire))
->decref(_unchecked_untagged_pyobj(), /*has_pyobj_slot*/ true);
// NB: this destructor can only be entered when there are no
// references to this C++ object (obviously), NOR any references
// to the PyObject (if there are references to the PyObject,
// then the PyObject holds an owning reference to the tensor).
// So it is OK to clear pyobj_ here as it is impossible for it to
// be used again (modulo weak reference races)
pyobj_ = nullptr; // for safety
}
}
PyInterpreter* PyObjectSlot::pyobj_interpreter() {
return pyobj_interpreter_.load(std::memory_order_acquire);
}
PyObject* PyObjectSlot::_unchecked_untagged_pyobj() const {
// NOLINTNEXTLINE(performance-no-int-to-ptr)
return reinterpret_cast<PyObject*>(
reinterpret_cast<uintptr_t>(pyobj_) & ~0x1ULL);
}
PyInterpreter& PyObjectSlot::load_pyobj_interpreter() const {
auto interpreter = pyobj_interpreter_.load(std::memory_order_acquire);
if (interpreter) {
return *interpreter;
}
TORCH_CHECK(false, "cannot access PyObject for Tensor - no interpreter set");
}
bool PyObjectSlot::owns_pyobj() {
// NOLINTNEXTLINE(performance-no-int-to-ptr)
return reinterpret_cast<uintptr_t>(pyobj_) & 1;
}
void PyObjectSlot::set_owns_pyobj(bool b) {
// NOLINTNEXTLINE(performance-no-int-to-ptr)
pyobj_ = reinterpret_cast<PyObject*>(
reinterpret_cast<uintptr_t>(_unchecked_untagged_pyobj()) | b);
}
} // namespace c10::impl

View File

@ -8,117 +8,70 @@
#include <atomic>
namespace torch::utils {
class PyObjectPreservation;
}
namespace c10::impl {
struct C10_API PyObjectSlot {
public:
PyObjectSlot();
~PyObjectSlot();
void maybe_destroy_pyobj();
// Associate the TensorImpl with the specified PyObject, and, if necessary,
// also tag the interpreter.
//
// NB: This lives in a header so that we can inline away the switch on status
//
// NB: THIS FUNCTION CAN RAISE AN EXCEPTION. Make sure to clean up after
// PyObject if necessary!
void init_pyobj(PyObject* pyobj) {
pyobj_interpreter_.store(
getGlobalPyInterpreter(), std::memory_order_relaxed);
pyobj_ = pyobj;
}
PyObjectSlot() : pyobj_interpreter_(nullptr), pyobj_(nullptr) {}
// Query the PyObject interpreter. This may return null if there is no
// interpreter. This is racy!
PyInterpreter* pyobj_interpreter();
PyObject* _unchecked_untagged_pyobj() const;
// Test the interpreter tag. If tagged for the current interpreter, return
// a non-nullopt (but possibly null) PyObject. If (possibly) untagged,
// returns a nullopt. If it is definitely invalid, raises an error.
//
// If `ignore_hermetic_tls` is false and this function is called from a
// hermetic context (ie, `HermeticPyObjectTLS::get_state()` is true), then
// nullopt is returned. If `ignore_hermetic_tls` is true, then the hermetic
// context is ignored, allowing you to check the interpreter tag of a
// nonhermetic PyObject from within a hermetic context. This is necessary
// because there are some cases where the deallocator function of a
// nonhermetic PyObject is called from within a hermetic context, so it must
// be properly treated as a nonhermetic PyObject.
//
// NB: this lives in header so that we can avoid actually creating the
// std::optional
// @todo alban: I'm not too sure what's going on here, we can probably delete
// it but it's worthwhile making sure
std::optional<PyObject*> check_pyobj(bool ignore_hermetic_tls = false) const {
impl::PyInterpreter* interpreter =
pyobj_interpreter_.load(std::memory_order_acquire);
if (interpreter == nullptr) {
return std::nullopt;
}
if (!ignore_hermetic_tls && c10::impl::HermeticPyObjectTLS::get_state()) {
return std::nullopt;
} else {
return _unchecked_untagged_pyobj();
}
// interpreter.
PyInterpreter* pyobj_interpreter() const {
return pyobj_interpreter_.load(std::memory_order_acquire);
}
PyInterpreter& load_pyobj_interpreter() const;
PyInterpreter& load_pyobj_interpreter() const {
auto interpreter = pyobj_interpreter_.load(std::memory_order_acquire);
TORCH_INTERNAL_ASSERT(
interpreter, "cannot access PyObject for Tensor - no interpreter set");
return *interpreter;
}
bool owns_pyobj();
PyObject* load_pyobj() const {
return pyobj_.load(std::memory_order_acquire);
}
void set_owns_pyobj(bool b);
bool has_unique_reference() const {
PyObject* pyobj = load_pyobj();
return pyobj != nullptr && load_pyobj_interpreter()->refcnt(pyobj) == 1;
}
void clear() {
pyobj_.store(nullptr, std::memory_order_relaxed);
pyobj_interpreter_.store(nullptr, std::memory_order_relaxed);
}
// Non thread-safe swap
void swap(PyObjectSlot& other) noexcept {
PyInterpreter* tmp_interpreter =
pyobj_interpreter_.load(std::memory_order_relaxed);
pyobj_interpreter_.store(
other.pyobj_interpreter_.load(std::memory_order_relaxed),
std::memory_order_relaxed);
other.pyobj_interpreter_.store(tmp_interpreter, std::memory_order_relaxed);
PyObject* tmp_pyobj = pyobj_.load(std::memory_order_relaxed);
pyobj_.store(
other.pyobj_.load(std::memory_order_relaxed),
std::memory_order_relaxed);
other.pyobj_.store(tmp_pyobj, std::memory_order_relaxed);
}
private:
// This field contains the interpreter tag for this object. See
// Note [Python interpreter tag] for general context
//
// Note [Memory ordering on Python interpreter tag]
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// What memory_order do we need when accessing this atomic? We don't
// need a single total modification order (as provided by
// memory_order_seq_cst) as pyobj_interpreter_ is monotonic: it can only
// transition from -1 to some positive integer and never changes afterwards.
// Because there is only one modification, it trivially already has a total
// modification order (e.g., we don't need fences or locked instructions on
// x86)
//
// In fact, one could make a reasonable argument that relaxed reads are OK,
// due to the presence of external locking (GIL) to ensure that interactions
// with other data structures are still correctly synchronized, so that
// we fall in the "Single-Location Data Structures" case as described in
// http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p2055r0.pdf
// However, on x86, it doesn't matter if I use acquire or relaxed on the load
// as I get the same assembly in both cases. So I just use the more
// conservative acquire (which will impede compiler optimizations but I don't
// care)
// This is now always the global interpreter if the PyObject is set.
// Maybe we can remove this field some day...
std::atomic<PyInterpreter*> pyobj_interpreter_;
// This field contains a reference to a PyObject representing this Tensor.
// If pyobj is nullptr, when we transfer Tensor to Python, we allocate a new
// PyObject for it and set this field. This field does not have to be
// protected by an atomic as it is only allowed to be accessed when you hold
// the GIL, or during destruction of the tensor.
//
// When a PyObject dies, you are obligated to clear this field
// (otherwise, you will try to use-after-free the pyobj); this currently
// occurs in THPVariable_clear in torch/csrc/autograd/python_variable.cpp
//
// NB: Ordinarily, this should not be a strong reference, as if the
// PyObject owns the Tensor, this would create a reference cycle.
// However, sometimes this ownership flips. To track who owns
// who, this has a single pointer tag indicating whether or not the
// C++ object owns the PyObject (the common case, zero, means PyObject
// owns the C++ object); see _unchecked_untagged_pyobj for raw access
// or check_pyobj for checked access. See references to PyObject
// resurrection in torch/csrc/autograd/python_variable.cpp
PyObject* pyobj_;
// The PyObject representing this Tensor or nullptr. Ownership is managed
// by intrusive_ptr. By the time the PyObjectSlot is destroyed, this
// reference is already dead.
std::atomic<PyObject*> pyobj_;
friend class torch::utils::PyObjectPreservation;
};
} // namespace c10::impl
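// Hedged usage sketch (not part of the header): how a caller might inspect the
// new atomic slot using only the accessors declared above. It mirrors what
// has_unique_reference() already does; the free-threading/ownership invariants
// that make this safe are assumed, not shown.
#include <c10/core/impl/PyObjectSlot.h>

static bool wrapper_is_sole_owner(const c10::impl::PyObjectSlot& slot) {
  PyObject* obj = slot.load_pyobj();  // acquire-load; nullptr if never wrapped
  if (obj == nullptr) {
    return false;
  }
  // A Python refcount of 1 means the slot's own strong reference is the only
  // thing keeping the wrapper alive (no Python-side references remain).
  return slot.load_pyobj_interpreter()->refcnt(obj) == 1;
}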

View File

@ -345,6 +345,13 @@ class CUDAAllocator : public DeviceAllocator {
c10::DeviceIndex device,
std::shared_ptr<AllocatorState> pps) = 0;
virtual std::string name() = 0;
std::pair<size_t, size_t> getMemoryInfo(c10::DeviceIndex device) override {
c10::DeviceGuard device_guard({at::kCUDA, device});
size_t free = 0;
size_t total = 0;
C10_CUDA_CHECK(cudaMemGetInfo(&free, &total));
return {free, total};
}
};
// Allocator object, statically initialized
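// Hedged sketch of what the new default getMemoryInfo() boils down to for a
// caller: ask the CUDA runtime for free/total bytes on a device. DeviceGuard
// and C10_CUDA_CHECK error handling are simplified away here.
#include <cuda_runtime.h>
#include <cstdio>

static void print_cuda_memory(int device) {
  size_t free_bytes = 0;
  size_t total_bytes = 0;
  cudaSetDevice(device);  // stands in for c10::DeviceGuard above
  if (cudaMemGetInfo(&free_bytes, &total_bytes) == cudaSuccess) {
    std::printf("device %d: %zu free / %zu total bytes\n",
                device, free_bytes, total_bytes);
  }
}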

View File

@ -66,6 +66,15 @@ def define_targets(rules):
],
)
rules.cc_test(
name = "util/nofatal_test",
srcs = ["util/nofatal_test.cpp"],
deps = [
"//c10/util:base",
"@com_google_googletest//:gtest_main",
],
)
rules.cc_test(
name = "util/ssize_test",
srcs = ["util/ssize_test.cpp"],

View File

@ -0,0 +1,53 @@
#include <gtest/gtest.h>
#include <c10/util/Exception.h>
#include <c10/util/Logging.h>
namespace {
template <typename T>
inline void expectThrowsEq(T&& fn, const char* expected_msg) {
try {
std::forward<T>(fn)();
} catch (const c10::Error& e) {
EXPECT_TRUE(
std::string(e.what_without_backtrace()).find(expected_msg) !=
std::string::npos);
return;
}
ADD_FAILURE() << "Expected to throw exception with message \"" << expected_msg
<< "\" but didn't throw";
}
} // namespace
TEST(NofatalTest, TorchCheckComparisons) {
// quick sanity check that a passing check works as a no-op
TORCH_CHECK_EQ(1, 1) << "i am a silly message " << 1;
expectThrowsEq(
[]() { TORCH_CHECK_EQ(1, 2) << "i am a silly message " << 1; },
"Check failed: 1 == 2 (1 vs. 2). i am a silly message 1");
expectThrowsEq(
[]() { TORCH_CHECK_NE(2, 2); }, "Check failed: 2 != 2 (2 vs. 2).");
expectThrowsEq(
[]() { TORCH_CHECK_LT(2, 2); }, "Check failed: 2 < 2 (2 vs. 2).");
expectThrowsEq(
[]() { TORCH_CHECK_LE(3, 2); }, "Check failed: 3 <= 2 (3 vs. 2).");
expectThrowsEq(
[]() { TORCH_CHECK_GT(2, 2); }, "Check failed: 2 > 2 (2 vs. 2).");
expectThrowsEq(
[]() { TORCH_CHECK_GE(2, 3); }, "Check failed: 2 >= 3 (2 vs. 3).");
expectThrowsEq(
[]() {
void* p = nullptr;
TORCH_CHECK_NOTNULL(p);
},
"Check failed: 'p' must be non NULL.");
#if GTEST_HAS_DEATH_TEST
#ifndef NDEBUG
// in a debug build, DCHECK should result in death
EXPECT_DEATH(TORCH_DCHECK_EQ(1, 2), "Check failed");
#else
TORCH_DCHECK_EQ(1, 2); // no-op
#endif
#endif // GTEST_HAS_DEATH_TEST
}

View File

@ -702,6 +702,98 @@ namespace c10::detail {
#define TORCH_CHECK_ARG(cond, argN, ...) \
TORCH_CHECK(cond, "invalid argument ", argN, ": ", __VA_ARGS__)
#ifndef FATAL_IF
#ifdef C10_USE_GLOG
#define FATAL_IF(condition) \
condition ? (void)0 \
: ::c10::LoggerVoidify() & \
::c10::MessageLogger(__FILE__, __LINE__, ::google::GLOG_FATAL) \
.stream()
#else
#define FATAL_IF(condition) \
condition ? (void)0 \
: ::c10::LoggerVoidify() & \
::c10::MessageLogger(__FILE__, __LINE__, ::c10::GLOG_FATAL).stream()
#endif
#endif
#ifndef NON_FATAL_IF
#ifdef C10_USE_GLOG
#define NON_FATAL_IF(condition) \
condition ? (void)0 \
: ::c10::LoggerVoidify() & \
::c10::MessageLogger( \
__FILE__, __LINE__, ::google::GLOG_FATAL, false) \
.stream()
#else
#define NON_FATAL_IF(condition) \
condition ? (void)0 \
: ::c10::LoggerVoidify() & \
::c10::MessageLogger(__FILE__, __LINE__, ::c10::GLOG_FATAL, false) \
.stream()
#endif
#endif
// Binary comparison check macros
#define TORCH_CHECK_OP(val1, val2, op) \
NON_FATAL_IF(((val1)op(val2))) \
<< "Check failed: " #val1 " " #op " " #val2 " (" << (val1) << " vs. " \
<< (val2) << "). "
#define TORCH_DCHECK_OP(val1, val2, op) \
FATAL_IF(((val1)op(val2))) << "Check failed: " #val1 " " #op " " #val2 " (" \
<< (val1) << " vs. " << (val2) << "). "
#define TORCH_CHECK_EQ(val1, val2) TORCH_CHECK_OP(val1, val2, ==)
#define TORCH_CHECK_NE(val1, val2) TORCH_CHECK_OP(val1, val2, !=)
#define TORCH_CHECK_LE(val1, val2) TORCH_CHECK_OP(val1, val2, <=)
#define TORCH_CHECK_LT(val1, val2) TORCH_CHECK_OP(val1, val2, <)
#define TORCH_CHECK_GE(val1, val2) TORCH_CHECK_OP(val1, val2, >=)
#define TORCH_CHECK_GT(val1, val2) TORCH_CHECK_OP(val1, val2, >)
// Debug versions of TORCH_CHECK_OP macros
#ifndef NDEBUG
#define TORCH_DCHECK_EQ(val1, val2) TORCH_DCHECK_OP(val1, val2, ==)
#define TORCH_DCHECK_NE(val1, val2) TORCH_DCHECK_OP(val1, val2, !=)
#define TORCH_DCHECK_LE(val1, val2) TORCH_DCHECK_OP(val1, val2, <=)
#define TORCH_DCHECK_LT(val1, val2) TORCH_DCHECK_OP(val1, val2, <)
#define TORCH_DCHECK_GE(val1, val2) TORCH_DCHECK_OP(val1, val2, >=)
#define TORCH_DCHECK_GT(val1, val2) TORCH_DCHECK_OP(val1, val2, >)
#else // !NDEBUG
// Optimized versions - generate no code
#define TORCH_DCHECK_EQ(val1, val2) \
while (false) \
TORCH_DCHECK_OP(val1, val2, ==)
#define TORCH_DCHECK_NE(val1, val2) \
while (false) \
TORCH_DCHECK_OP(val1, val2, !=)
#define TORCH_DCHECK_LE(val1, val2) \
while (false) \
TORCH_DCHECK_OP(val1, val2, <=)
#define TORCH_DCHECK_LT(val1, val2) \
while (false) \
TORCH_DCHECK_OP(val1, val2, <)
#define TORCH_DCHECK_GE(val1, val2) \
while (false) \
TORCH_DCHECK_OP(val1, val2, >=)
#define TORCH_DCHECK_GT(val1, val2) \
while (false) \
TORCH_DCHECK_OP(val1, val2, >)
#endif // NDEBUG
// Null pointer check macro
#define TORCH_CHECK_NOTNULL(val) \
::c10::CheckNotNull(__FILE__, __LINE__, #val, (val), false)
#ifndef NDEBUG
#define TORCH_DCHECK_NOTNULL(val) \
::c10::CheckNotNull(__FILE__, __LINE__, #val, (val), true)
#else // !NDEBUG
#define TORCH_DCHECK_NOTNULL(val) \
while (false) \
TORCH_CHECK_NOTNULL(val)
#endif // NDEBUG
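// Hedged usage sketch (illustrative, not part of this header): a failed
// non-fatal check now throws c10::Error via NON_FATAL_IF instead of aborting,
// so callers can catch it; TORCH_DCHECK_* stays fatal in debug builds and
// compiles to nothing under NDEBUG.
inline bool values_match(int expected, int actual) {
  try {
    TORCH_CHECK_EQ(expected, actual) << "values diverged";
    return true;
  } catch (const c10::Error& e) {
    // e.what_without_backtrace() begins with
    // "Check failed: expected == actual (<lhs> vs. <rhs>). values diverged"
    return false;
  }
}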
// ----------------------------------------------------------------------------
// Deprecated macros
// ----------------------------------------------------------------------------

View File

@ -291,6 +291,32 @@ namespace c10 {
using fLB::FLAGS_logtostderr;
using fLI::FLAGS_minloglevel;
using fLI::FLAGS_v;
MessageLogger::MessageLogger(
const char* file,
int line,
int severity,
bool exit_on_fatal)
: stream_(), severity_(severity), exit_on_fatal_(exit_on_fatal) {}
MessageLogger::~MessageLogger() noexcept(false) {
if (severity_ == ::google::GLOG_FATAL) {
DealWithFatal();
}
}
std::stringstream& MessageLogger::stream() {
return stream_;
}
void MessageLogger::DealWithFatal() {
if (exit_on_fatal_) {
LOG(FATAL) << stream_.str();
} else {
throw c10::Error(stream_.str(), nullptr, nullptr);
}
}
} // namespace c10
C10_DEFINE_int(
@ -412,17 +438,16 @@ void ShowLogInfoToStderr() {
FLAGS_caffe2_log_level = GLOG_INFO;
}
MessageLogger::MessageLogger(const char* file, int line, int severity)
: severity_(severity) {
MessageLogger::MessageLogger(
const char* file,
int line,
int severity,
bool exit_on_fatal)
: severity_(severity), exit_on_fatal_(exit_on_fatal) {
if (severity_ < FLAGS_caffe2_log_level) {
// Nothing needs to be logged.
return;
}
#ifdef ANDROID
tag_ = "native";
#else // !ANDROID
tag_ = "";
#endif // ANDROID
time_t rawtime = 0;
time(&rawtime);
@ -458,7 +483,7 @@ MessageLogger::MessageLogger(const char* file, int line, int severity)
}
// Output the contents of the stream to the proper channel on destruction.
MessageLogger::~MessageLogger() {
MessageLogger::~MessageLogger() noexcept(false) {
if (severity_ < FLAGS_caffe2_log_level) {
// Nothing needs to be logged.
return;
@ -498,6 +523,18 @@ MessageLogger::~MessageLogger() {
}
}
std::stringstream& MessageLogger::stream() {
return stream_;
}
void MessageLogger::DealWithFatal() {
if (exit_on_fatal_) {
abort();
} else {
throw c10::Error(stream_.str(), nullptr, nullptr);
}
}
} // namespace c10
#endif // !C10_USE_GLOG

View File

@ -12,6 +12,10 @@ template <typename, typename...>
class class_;
}
namespace torch::utils {
class PyObjectPreservation;
}
namespace c10 {
class intrusive_ptr_target;
namespace raw {
@ -33,6 +37,8 @@ constexpr uint64_t kImpracticallyHugeWeakReferenceCount =
constexpr uint64_t kReferenceCountOne = 1;
constexpr uint64_t kWeakReferenceCountOne = (kReferenceCountOne << 32);
constexpr uint64_t kUniqueRef = (kReferenceCountOne | kWeakReferenceCountOne);
// Indicates whether the object has a PyObject wrapper.
constexpr uint64_t kHasPyObject = (uint64_t(1) << 63);
template <class TTarget>
struct intrusive_target_default_null_type final {
@ -55,7 +61,11 @@ inline uint32_t refcount(uint64_t combined_refcount) {
}
inline uint32_t weakcount(uint64_t combined_refcount) {
return static_cast<uint32_t>(combined_refcount >> 32);
return static_cast<uint32_t>((combined_refcount & ~kHasPyObject) >> 32);
}
inline bool has_pyobject(uint64_t combined_refcount) {
return (combined_refcount & kHasPyObject) != 0;
}
// The only requirement for refcount increment is that it happens-before
@ -66,12 +76,6 @@ inline uint64_t atomic_combined_refcount_increment(
return combined_refcount.fetch_add(inc, std::memory_order_relaxed) + inc;
}
inline uint32_t atomic_refcount_increment(
std::atomic<uint64_t>& combined_refcount) {
return detail::refcount(atomic_combined_refcount_increment(
combined_refcount, kReferenceCountOne));
}
inline uint32_t atomic_weakcount_increment(
std::atomic<uint64_t>& combined_refcount) {
return detail::weakcount(atomic_combined_refcount_increment(
@ -99,6 +103,11 @@ inline uint32_t atomic_weakcount_decrement(
combined_refcount, kWeakReferenceCountOne));
}
template <class T, class = void>
struct TargetTraits {
static constexpr bool can_have_pyobject = false;
};
} // namespace detail
/**
@ -155,6 +164,23 @@ class C10_API intrusive_ptr_target {
// we can atomically operate on both at the same time for performance
// and defined behaviors.
//
// Note [PyObject preservation for Tensor and Storages]
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// intrusive_ptr has special support for preserving PyObject wrappers
// for TensorImpl and StorageImpl. The most significant bit (kHasPyObject) of
// the combined_refcount_ is used to indicate whether the object has a
// PyObject wrapper.
//
// - The PyObject, if it exists, holds a strong reference to the
// intrusive_ptr_target.
//
// - When the refcount goes from 1 to 2, we incref the PyObject.
//
// - When the refcount goes from 2 to 1, we decref the PyObject.
//
// In other words, the intrusive_ptr keeps the PyObject alive as long as there
// are other C++ references to the intrusive_ptr_target.
mutable std::atomic<uint64_t> combined_refcount_;
static_assert(sizeof(std::atomic<uint64_t>) == 8);
static_assert(alignof(std::atomic<uint64_t>) == 8);
@ -172,6 +198,8 @@ class C10_API intrusive_ptr_target {
template <typename T>
friend struct ExclusivelyOwnedTensorTraits;
friend class torch::utils::PyObjectPreservation;
protected:
// protected destructor. We never want to destruct intrusive_ptr_target*
// directly.
@ -255,6 +283,16 @@ class C10_API intrusive_ptr_target {
*/
virtual void release_resources() {}
/**
* These two methods are called when the refcount transitions between one
* and two and the object has a PyObject wrapper.
*/
virtual void incref_pyobject() const {}
virtual void decref_pyobject() const {}
virtual bool try_incref_pyobject() const {
return false;
}
uint32_t refcount(std::memory_order order = std::memory_order_relaxed) const {
return detail::refcount(combined_refcount_.load(order));
}
@ -265,6 +303,15 @@ class C10_API intrusive_ptr_target {
}
};
namespace detail {
template <>
struct TargetTraits<c10::intrusive_ptr_target> {
// A generic intrusive_ptr<intrusive_ptr_target> may actually be a TensorImpl
// or StorageImpl, so we have to allow for PyObject support.
static constexpr bool can_have_pyobject = true;
};
} // namespace detail
template <class TTarget, class NullType>
class weak_intrusive_ptr;
@ -314,18 +361,34 @@ class intrusive_ptr final {
void retain_() {
if (target_ != NullType::singleton()) {
uint32_t new_refcount =
detail::atomic_refcount_increment(target_->combined_refcount_);
uint64_t combined = detail::atomic_combined_refcount_increment(
target_->combined_refcount_, detail::kReferenceCountOne);
uint32_t new_refcount = detail::refcount(combined);
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
new_refcount != 1,
"intrusive_ptr: Cannot increase refcount after it reached zero.");
if constexpr (detail::TargetTraits<TTarget>::can_have_pyobject) {
// If the refcount transitioned from 1 to 2, we need to incref the
// PyObject. In other words, we need to ensure that the PyObject stays
// alive now that we have a C++ reference to this object in addition to
// the PyObject itself.
if (C10_UNLIKELY(
detail::has_pyobject(combined) &&
detail::refcount(combined) == 2)) {
target_->incref_pyobject();
}
} else {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
!detail::has_pyobject(combined),
"TargetTraits indicates that type cannot have PyObject, but refcount has PyObject bit set.");
}
}
}
void reset_() noexcept {
if (target_ != NullType::singleton()) {
if (target_->combined_refcount_.load(std::memory_order_acquire) ==
detail::kUniqueRef) {
if (is_uniquely_owned()) {
// Both counts are 1, so there are no weak references and
// we are releasing the last strong reference. No other
// threads can observe the effects of this target_ deletion
@ -337,9 +400,10 @@ class intrusive_ptr final {
auto combined_refcount = detail::atomic_combined_refcount_decrement(
target_->combined_refcount_, detail::kReferenceCountOne);
if (detail::refcount(combined_refcount) == 0) {
bool should_delete =
(combined_refcount == detail::kWeakReferenceCountOne);
uint32_t new_refcount = detail::refcount(combined_refcount);
bool has_pyobject = detail::has_pyobject(combined_refcount);
if (new_refcount == 0) {
bool should_delete = detail::weakcount(combined_refcount) == 1;
// See comment above about weakcount. As long as refcount>0,
// weakcount is one larger than the actual number of weak references.
// So we need to decrement it here.
@ -356,6 +420,18 @@ class intrusive_ptr final {
if (should_delete) {
delete target_;
}
} else if constexpr (detail::TargetTraits<TTarget>::can_have_pyobject) {
// If the refcount transitioned from 2 to 1, we need to decref the
// PyObject. In other words, we don't want to keep the PyObject alive if
// there are no C++ references to this object other than the PyObject
// itself.
if (C10_UNLIKELY(has_pyobject && new_refcount == 1)) {
target_->decref_pyobject();
}
} else {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
!has_pyobject,
"TargetTraits indicates that type cannot have PyObject, but refcount has PyObject bit set.");
}
}
}
@ -522,6 +598,16 @@ class intrusive_ptr final {
return use_count() == 1;
}
/**
* Stronger than unique() in that it must not have any weakrefs as well.
*/
bool is_uniquely_owned() const noexcept {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(target_ != NullType::singleton());
uint64_t combined =
target_->combined_refcount_.load(std::memory_order_acquire);
return (combined & ~detail::kHasPyObject) == detail::kUniqueRef;
}
/**
* Returns an owning (!) pointer to the underlying object and makes the
* intrusive_ptr instance invalid. That means the refcount is not decreased.
@ -932,6 +1018,7 @@ class weak_intrusive_ptr final {
if (target_ == NullType::singleton()) {
return intrusive_ptr<TTarget, NullType>();
} else {
bool increfed = false;
auto combined_refcount =
target_->combined_refcount_.load(std::memory_order_relaxed);
do {
@ -940,12 +1027,31 @@ class weak_intrusive_ptr final {
// Return nullptr.
return intrusive_ptr<TTarget, NullType>();
}
if constexpr (detail::TargetTraits<TTarget>::can_have_pyobject) {
if (detail::has_pyobject(combined_refcount) &&
detail::refcount(combined_refcount) == 1 && !increfed) {
// Object has a Python wrapper with no other C++ references.
// We need to incref the Python object before we acquire a
// strong reference to the C++ object to avoid a situation
// where the Python object is deallocated concurrently.
if (!target_->try_incref_pyobject()) {
return intrusive_ptr<TTarget, NullType>();
}
increfed = true;
}
}
} while (!target_->combined_refcount_.compare_exchange_weak(
combined_refcount,
combined_refcount + detail::kReferenceCountOne,
std::memory_order_acquire,
std::memory_order_relaxed));
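// Hedged note: `increfed` means we speculatively incremented the Python
// refcount, expecting to perform the strong-count 1 -> 2 transition
// ourselves. If the CAS that finally succeeded saw a strong count other
// than 1, another thread performed that transition (and its incref), so
// the extra Python reference taken above is dropped here.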
if constexpr (detail::TargetTraits<TTarget>::can_have_pyobject) {
if (increfed && detail::refcount(combined_refcount) != 1) {
target_->decref_pyobject();
}
}
return intrusive_ptr<TTarget, NullType>(
target_, raw::DontIncreaseRefcount{});
}
@ -1060,7 +1166,14 @@ namespace intrusive_ptr {
// NullType::singleton to this function
inline void incref(intrusive_ptr_target* self) {
if (self) {
detail::atomic_refcount_increment(self->combined_refcount_);
uint64_t combined = detail::atomic_combined_refcount_increment(
self->combined_refcount_, detail::kReferenceCountOne);
if (C10_UNLIKELY(
detail::has_pyobject(combined) &&
detail::refcount(combined) == 2)) {
self->incref_pyobject();
}
}
}
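// Hedged sketch of the combined_refcount_ layout used above: the strong count
// lives in the low 32 bits, the weak count in bits 32..62, and kHasPyObject
// occupies bit 63. These standalone helpers mirror detail::refcount,
// detail::weakcount and detail::has_pyobject.
#include <cstdint>

namespace sketch {
constexpr uint64_t kRefOne  = uint64_t(1);
constexpr uint64_t kWeakOne = uint64_t(1) << 32;
constexpr uint64_t kHasPy   = uint64_t(1) << 63;

constexpr uint32_t strong_count(uint64_t c) { return static_cast<uint32_t>(c); }
constexpr uint32_t weak_count(uint64_t c) {
  return static_cast<uint32_t>((c & ~kHasPy) >> 32);
}
constexpr bool has_py(uint64_t c) { return (c & kHasPy) != 0; }

// A uniquely owned object: one strong ref, one (implicit) weak ref, no wrapper.
static_assert(strong_count(kRefOne | kWeakOne) == 1, "");
static_assert(weak_count(kRefOne | kWeakOne) == 1, "");
static_assert(!has_py(kRefOne | kWeakOne), "");
// Setting the flag does not perturb either count.
static_assert(strong_count(kHasPy | kRefOne | kWeakOne) == 1, "");
static_assert(weak_count(kHasPy | kRefOne | kWeakOne) == 1, "");
} // namespace sketch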

c10/util/logging_common.h (new file, 74 lines)
View File

@ -0,0 +1,74 @@
#ifndef C10_UTIL_LOGGING_COMMON_H_
#define C10_UTIL_LOGGING_COMMON_H_
#include <c10/macros/Export.h>
#include <sstream>
namespace c10 {
// MessageLogger that throws exceptions instead of aborting (glog version)
// or logs and may abort (non-glog version).
class C10_API MessageLogger {
public:
MessageLogger(
const char* file,
int line,
int severity,
bool exit_on_fatal = true);
~MessageLogger() noexcept(false);
// Return the stream associated with the logger object.
std::stringstream& stream();
private:
// When there is a fatal log and exit_on_fatal_ is true, we abort;
// otherwise, we throw.
void DealWithFatal();
#if defined(ANDROID) && !defined(C10_USE_GLOG)
const char* tag_{"native"};
#endif
std::stringstream stream_;
int severity_;
bool exit_on_fatal_;
};
// This class is used to explicitly ignore values in the conditional
// logging macros. This avoids compiler warnings like "value computed
// is not used" and "statement has no effect".
class C10_API LoggerVoidify {
public:
LoggerVoidify() = default;
// This has to be an operator with a precedence lower than << but
// higher than ?:
void operator&(const std::ostream& s [[maybe_unused]]) {}
};
// Forward declarations for CheckNotNull functions
template <typename T>
T& CheckNotNullCommon(
const char* file,
int line,
const char* names,
T& t,
bool fatal = true);
template <typename T>
T* CheckNotNull(
const char* file,
int line,
const char* names,
T* t,
bool fatal = true);
template <typename T>
T& CheckNotNull(
const char* file,
int line,
const char* names,
T& t,
bool fatal = true);
} // namespace c10
#endif // C10_UTIL_LOGGING_COMMON_H_
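// Hedged, self-contained illustration of the trick LoggerVoidify relies on:
// operator& binds more loosely than << but more tightly than ?:, so the whole
// message is streamed first and then discarded, and both arms of the ternary
// have type void. The names below are made up for the demo.
#include <iostream>

struct DemoVoidify {
  void operator&(const std::ostream&) {}  // same shape as c10::LoggerVoidify
};

#define DEMO_LOG_IF_FALSE(cond) (cond) ? (void)0 : DemoVoidify() & std::cerr

int main() {
  DEMO_LOG_IF_FALSE(1 == 2) << "check failed: 1 == 2\n";  // streamed, then voided
  DEMO_LOG_IF_FALSE(2 == 2) << "never evaluated\n";        // condition true: no-op
  return 0;
}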

View File

@ -47,57 +47,53 @@ INSTANTIATE_FOR_CONTAINER(set)
#endif
#include <c10/util/logging_common.h>
#include <glog/logging.h>
// Additional macros on top of glog
#define TORCH_CHECK_EQ(val1, val2) CHECK_EQ(val1, val2)
#define TORCH_CHECK_NE(val1, val2) CHECK_NE(val1, val2)
#define TORCH_CHECK_LE(val1, val2) CHECK_LE(val1, val2)
#define TORCH_CHECK_LT(val1, val2) CHECK_LT(val1, val2)
#define TORCH_CHECK_GE(val1, val2) CHECK_GE(val1, val2)
#define TORCH_CHECK_GT(val1, val2) CHECK_GT(val1, val2)
namespace c10 {
#ifndef NDEBUG
#define TORCH_DCHECK_EQ(val1, val2) DCHECK_EQ(val1, val2)
#define TORCH_DCHECK_NE(val1, val2) DCHECK_NE(val1, val2)
#define TORCH_DCHECK_LE(val1, val2) DCHECK_LE(val1, val2)
#define TORCH_DCHECK_LT(val1, val2) DCHECK_LT(val1, val2)
#define TORCH_DCHECK_GE(val1, val2) DCHECK_GE(val1, val2)
#define TORCH_DCHECK_GT(val1, val2) DCHECK_GT(val1, val2)
#else // !NDEBUG
// These versions generate no code in optimized mode.
#define TORCH_DCHECK_EQ(val1, val2) \
while (false) \
DCHECK_EQ(val1, val2)
#define TORCH_DCHECK_NE(val1, val2) \
while (false) \
DCHECK_NE(val1, val2)
#define TORCH_DCHECK_LE(val1, val2) \
while (false) \
DCHECK_LE(val1, val2)
#define TORCH_DCHECK_LT(val1, val2) \
while (false) \
DCHECK_LT(val1, val2)
#define TORCH_DCHECK_GE(val1, val2) \
while (false) \
DCHECK_GE(val1, val2)
#define TORCH_DCHECK_GT(val1, val2) \
while (false) \
DCHECK_GT(val1, val2)
#endif // NDEBUG
[[noreturn]] void ThrowEnforceNotMet(
const char* file,
const int line,
const char* condition,
const std::string& msg,
const void* caller);
// Check that a pointer is not null.
#define TORCH_CHECK_NOTNULL(val) CHECK_NOTNULL(val)
template <typename T>
T& CheckNotNullCommon(
const char* file,
int line,
const char* names,
T& t,
bool fatal) {
if (t == nullptr) {
MessageLogger(file, line, ::google::GLOG_FATAL, fatal).stream()
<< "Check failed: '" << names << "' must be non NULL. ";
}
return t;
}
#ifndef NDEBUG
// Debug only version of TORCH_CHECK_NOTNULL
#define TORCH_DCHECK_NOTNULL(val) DCHECK_NOTNULL(val)
#else // !NDEBUG
// Optimized version - generates no code.
#define TORCH_DCHECK_NOTNULL(val) \
while (false) \
DCHECK_NOTNULL(val)
#endif // NDEBUG
template <typename T>
T* CheckNotNull(
const char* file,
int line,
const char* names,
T* t,
bool fatal) {
return CheckNotNullCommon(file, line, names, t, fatal);
}
template <typename T>
T& CheckNotNull(
const char* file,
int line,
const char* names,
T& t,
bool fatal) {
return CheckNotNullCommon(file, line, names, t, fatal);
}
} // namespace c10
// Log with source location information override (to be used in generic
// warning/error handlers implemented as functions, not macros)

View File

@ -13,6 +13,7 @@
#include <vector>
#include <c10/util/Flags.h>
#include <c10/util/logging_common.h>
const char CAFFE2_SEVERITY_PREFIX[] = "FEWIV";
@ -24,61 +25,40 @@ const int GLOG_ERROR = 2;
const int GLOG_WARNING = 1;
const int GLOG_INFO = 0;
class C10_API MessageLogger {
public:
MessageLogger(const char* file, int line, int severity);
~MessageLogger();
// Return the stream associated with the logger object.
std::stringstream& stream() {
return stream_;
}
private:
// When there is a fatal log, we simply abort.
void DealWithFatal() {
abort();
}
const char* tag_;
std::stringstream stream_;
int severity_;
};
// This class is used to explicitly ignore values in the conditional
// logging macros. This avoids compiler warnings like "value computed
// is not used" and "statement has no effect".
class C10_API LoggerVoidify {
public:
LoggerVoidify() = default;
// This has to be an operator with a precedence lower than << but
// higher than ?:
void operator&(const std::ostream& s [[maybe_unused]]) {}
};
// Log a message and terminate.
template <class T>
void LogMessageFatal(const char* file, int line, const T& message) {
MessageLogger(file, line, GLOG_FATAL).stream() << message;
}
// Helpers for TORCH_CHECK_NOTNULL(). Two are necessary to support both raw
// pointers and smart pointers.
template <typename T>
T& CheckNotNullCommon(const char* file, int line, const char* names, T& t) {
T& CheckNotNullCommon(
const char* file,
int line,
const char* names,
T& t,
bool fatal) {
if (t == nullptr) {
LogMessageFatal(file, line, std::string(names));
MessageLogger(file, line, GLOG_FATAL, fatal).stream()
<< "Check failed: '" << names << "' must be non NULL. ";
}
return t;
}
template <typename T>
T* CheckNotNull(const char* file, int line, const char* names, T* t) {
return CheckNotNullCommon(file, line, names, t);
T* CheckNotNull(
const char* file,
int line,
const char* names,
T* t,
bool fatal) {
return CheckNotNullCommon(file, line, names, t, fatal);
}
template <typename T>
T& CheckNotNull(const char* file, int line, const char* names, T& t) {
return CheckNotNullCommon(file, line, names, t);
T& CheckNotNull(
const char* file,
int line,
const char* names,
T& t,
bool fatal) {
return CheckNotNullCommon(file, line, names, t, fatal);
}
} // namespace c10
@ -136,65 +116,6 @@ static_assert(
::c10::MessageLogger(__FILE__, __LINE__, ::c10::GLOG_##n).stream()
#endif // NDEBUG
#define TORCH_CHECK_OP(val1, val2, op) \
FATAL_IF(((val1)op(val2))) << "Check failed: " #val1 " " #op " " #val2 " (" \
<< (val1) << " vs. " << (val2) << ") "
// TORCH_CHECK_OP macro definitions
#define TORCH_CHECK_EQ(val1, val2) TORCH_CHECK_OP(val1, val2, ==)
#define TORCH_CHECK_NE(val1, val2) TORCH_CHECK_OP(val1, val2, !=)
#define TORCH_CHECK_LE(val1, val2) TORCH_CHECK_OP(val1, val2, <=)
#define TORCH_CHECK_LT(val1, val2) TORCH_CHECK_OP(val1, val2, <)
#define TORCH_CHECK_GE(val1, val2) TORCH_CHECK_OP(val1, val2, >=)
#define TORCH_CHECK_GT(val1, val2) TORCH_CHECK_OP(val1, val2, >)
#ifndef NDEBUG
// Debug only versions of TORCH_CHECK_OP macros.
#define TORCH_DCHECK_EQ(val1, val2) TORCH_CHECK_OP(val1, val2, ==)
#define TORCH_DCHECK_NE(val1, val2) TORCH_CHECK_OP(val1, val2, !=)
#define TORCH_DCHECK_LE(val1, val2) TORCH_CHECK_OP(val1, val2, <=)
#define TORCH_DCHECK_LT(val1, val2) TORCH_CHECK_OP(val1, val2, <)
#define TORCH_DCHECK_GE(val1, val2) TORCH_CHECK_OP(val1, val2, >=)
#define TORCH_DCHECK_GT(val1, val2) TORCH_CHECK_OP(val1, val2, >)
#else // !NDEBUG
// These versions generate no code in optimized mode.
#define TORCH_DCHECK_EQ(val1, val2) \
while (false) \
TORCH_CHECK_OP(val1, val2, ==)
#define TORCH_DCHECK_NE(val1, val2) \
while (false) \
TORCH_CHECK_OP(val1, val2, !=)
#define TORCH_DCHECK_LE(val1, val2) \
while (false) \
TORCH_CHECK_OP(val1, val2, <=)
#define TORCH_DCHECK_LT(val1, val2) \
while (false) \
TORCH_CHECK_OP(val1, val2, <)
#define TORCH_DCHECK_GE(val1, val2) \
while (false) \
TORCH_CHECK_OP(val1, val2, >=)
#define TORCH_DCHECK_GT(val1, val2) \
while (false) \
TORCH_CHECK_OP(val1, val2, >)
#endif // NDEBUG
// Check that a pointer is not null.
#define TORCH_CHECK_NOTNULL(val) \
::c10::CheckNotNull( \
__FILE__, __LINE__, "Check failed: '" #val "' Must be non NULL", (val))
#ifndef NDEBUG
// Debug only version of TORCH_CHECK_NOTNULL
#define TORCH_DCHECK_NOTNULL(val) \
::c10::CheckNotNull( \
__FILE__, __LINE__, "Check failed: '" #val "' Must be non NULL", (val))
#else // !NDEBUG
// Optimized version - generates no code.
#define TORCH_DCHECK_NOTNULL(val) \
while (false) \
TORCH_CHECK_NOTNULL(val)
#endif // NDEBUG
// ---------------------- Support for std objects --------------------------
// These are adapted from glog to support a limited set of logging capability
// for STL objects.

View File

@ -926,15 +926,14 @@ class DeviceCachingAllocator {
(release_cached_blocks() && alloc_block(params, true));
}
if (!block_found) {
c10::xpu::DeviceProp device_prop;
c10::xpu::get_device_properties(&device_prop, device);
auto device_total = device_prop.global_mem_size;
const auto& raw_device = c10::xpu::get_raw_device(device);
const auto device_total =
raw_device.get_info<sycl::info::device::global_mem_size>();
// Estimate the available device memory when the SYCL runtime does not
// support the corresponding aspect (ext_intel_free_memory).
size_t device_free = device_prop.global_mem_size -
size_t device_free = device_total -
stats.reserved_bytes[static_cast<size_t>(StatType::AGGREGATE)]
.current;
auto& raw_device = c10::xpu::get_raw_device(device);
// TODO: Remove the aspect check once the SYCL runtime bug is fixed on
// affected devices.
if (raw_device.has(sycl::aspect::ext_intel_free_memory)) {
@ -1052,21 +1051,37 @@ class DeviceCachingAllocator {
}
}
std::pair<size_t, size_t> getMemoryInfo() {
const auto& device = c10::xpu::get_raw_device(device_index);
const size_t total = device.get_info<sycl::info::device::global_mem_size>();
TORCH_CHECK(
device.has(sycl::aspect::ext_intel_free_memory),
"The device (",
device.get_info<sycl::info::device::name>(),
") doesn't support querying the available free memory. ",
"You can file an issue at https://github.com/pytorch/pytorch/issues ",
"to help us prioritize its implementation.");
const size_t free =
device.get_info<sycl::ext::intel::info::device::free_memory>();
return {free, total};
}
double getMemoryFraction() {
if (!set_fraction) {
return 1.0;
}
c10::xpu::DeviceProp device_prop;
c10::xpu::get_device_properties(&device_prop, device_index);
const auto device_total =
xpu::get_raw_device(device_index)
.get_info<sycl::info::device::global_mem_size>();
return static_cast<double>(allowed_memory_maximum) /
static_cast<double>(device_prop.global_mem_size);
static_cast<double>(device_total);
}
void setMemoryFraction(double fraction) {
c10::xpu::DeviceProp device_prop;
c10::xpu::get_device_properties(&device_prop, device_index);
auto device_total = device_prop.global_mem_size;
const auto device_total =
xpu::get_raw_device(device_index)
.get_info<sycl::info::device::global_mem_size>();
allowed_memory_maximum = static_cast<size_t>(fraction * device_total);
set_fraction = true;
}
@ -1240,6 +1255,11 @@ class XPUAllocator : public DeviceAllocator {
c10::xpu::get_raw_device(dev_to_access));
}
std::pair<size_t, size_t> getMemoryInfo(DeviceIndex device) override {
assertValidDevice(device);
return device_allocators[device]->getMemoryInfo();
}
double getMemoryFraction(DeviceIndex device) {
assertValidDevice(device);
return device_allocators[device]->getMemoryFraction();

View File

@ -1941,6 +1941,7 @@ if(BUILD_TEST)
foreach(test_src ${Caffe2_XPU_TEST_SRCS})
get_filename_component(test_name ${test_src} NAME_WE)
add_executable(${test_name} "${test_src}")
torch_compile_options(${test_name})
target_link_libraries(${test_name} torch_library gtest_main)
target_include_directories(${test_name} PRIVATE $<INSTALL_INTERFACE:include>)
target_include_directories(${test_name} PRIVATE ${Caffe2_CPU_INCLUDE})

View File

@ -73,6 +73,19 @@ void box_cox_zero_lambda(
}
}
template <typename T>
at::vec::Vectorized<T> box_cox_nonzero_lambda_impl(
at::vec::Vectorized<T> data,
at::vec::Vectorized<T> lambda1,
at::vec::Vectorized<T> lambda2,
at::vec::Vectorized<T> k_eps) {
auto sum = data + lambda2;
auto max = at::vec::max(sum, k_eps);
auto lambda_over_1 = at::vec::fast_recieprocal(lambda1);
auto pow = max.pow(lambda1);
return at::vec::fmsub(pow, lambda_over_1, lambda_over_1);
}
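// The helper above evaluates the nonzero-lambda Box-Cox transform
//   y = (max(x + lambda2, eps)^lambda1 - 1) / lambda1
// with the division rewritten as a fused multiply-subtract:
//   pow * (1/lambda1) - (1/lambda1)
// where fast_recieprocal supplies the (possibly approximate) 1/lambda1.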
template <typename T>
void box_cox_nonzero_lambda(
int64_t D,
@ -88,21 +101,18 @@ void box_cox_nonzero_lambda(
auto k_eps_vec = Vec(k_eps);
for(; j + VLEN < D; j += VLEN) {
auto data = Vec::loadu(data_ptr + j);
auto lambda2 = Vec::loadu(lambda2_ptr + j);
auto sum = data + lambda2;
auto max = at::vec::max(sum, k_eps_vec);
auto lambda1 = Vec::loadu(lambda1_ptr + j);
auto lambda_over_1 = at::vec::fast_recieprocal(lambda1);
auto pow = max.pow(lambda1);
auto res = at::vec::fmsub(pow, lambda_over_1, lambda_over_1);
auto lambda2 = Vec::loadu(lambda2_ptr + j);
auto res = box_cox_nonzero_lambda_impl(data, lambda1, lambda2, k_eps_vec);
res.store(out + j);
}
for ( ;j < D; ++j) {
auto sum = data_ptr[j] + lambda2_ptr[j];
auto max = std::max(sum, k_eps);
auto lambda_over_1 = at::vec::fast_recieprocal(lambda1_ptr[j]);
auto pow = std::pow(max, lambda1_ptr[j]);
out[j] = pow * lambda_over_1 - lambda_over_1;
if (j < D) {
auto remaining = D - j;
auto data = Vec::loadu(data_ptr + j, remaining);
auto lambda1 = Vec::loadu(lambda1_ptr + j, remaining);
auto lambda2 = Vec::loadu(lambda2_ptr + j, remaining);
auto res = box_cox_nonzero_lambda_impl(data, lambda1, lambda2, k_eps_vec);
res.store(out + j, remaining);
}
}
#else

View File

@ -478,6 +478,7 @@ function(torch_update_find_cuda_flags)
endfunction()
include(CheckCXXCompilerFlag)
include(CheckCCompilerFlag)
include(CheckLinkerFlag)
##############################################################################
@ -501,6 +502,24 @@ function(append_cxx_flag_if_supported flag outputvar)
endif()
endfunction()
function(append_c_flag_if_supported flag outputvar)
string(TOUPPER "HAS${flag}" _FLAG_NAME)
string(REGEX REPLACE "[=-]" "_" _FLAG_NAME "${_FLAG_NAME}")
# GCC silences unknown -Wno-XXX flags, so test the corresponding -WXXX.
if(CMAKE_C_COMPILER_ID STREQUAL "GNU")
string(REGEX REPLACE "^Wno-" "W" new_flag "${flag}")
else()
set(new_flag "${flag}")
endif()
check_c_compiler_flag("${new_flag}" ${_FLAG_NAME})
if(${_FLAG_NAME})
string(APPEND ${outputvar} " ${flag}")
set(${outputvar} "${${outputvar}}" PARENT_SCOPE)
endif()
endfunction()
function(target_compile_options_if_supported target flag)
set(_compile_options "")
append_cxx_flag_if_supported("${flag}" _compile_options)

View File

@ -40,6 +40,7 @@
:nosignatures:
empty_cache
get_memory_info
max_memory_allocated
max_memory_reserved
memory_allocated

View File

@ -382,20 +382,6 @@ coverage_ignore_functions = [
# torch.ao.quantization.backend_config.tensorrt
"get_tensorrt_backend_config",
"get_tensorrt_backend_config_dict",
# torch.ao.quantization.backend_config.utils
"entry_to_pretty_str",
"get_fused_module_classes",
"get_fuser_method_mapping",
"get_fusion_pattern_to_extra_inputs_getter",
"get_fusion_pattern_to_root_node_getter",
"get_module_to_qat_module",
"get_pattern_to_dtype_configs",
"get_pattern_to_input_type_to_index",
"get_qat_module_classes",
"get_root_module_to_quantized_reference_module",
"pattern_to_human_readable",
"remove_boolean_dispatch_from_name",
# torch.ao.quantization.backend_config.x86
"get_x86_backend_config",
# torch.ao.quantization.fuse_modules
"fuse_known_modules",
@ -426,25 +412,6 @@ coverage_ignore_functions = [
"insert_observers_for_model",
"prepare",
"propagate_dtypes_for_known_nodes",
# torch.ao.quantization.fx.utils
"all_node_args_except_first",
"all_node_args_have_no_tensors",
"assert_and_get_unique_device",
"collect_producer_nodes",
"create_getattr_from_value",
"create_node_from_old_node_preserve_meta",
"get_custom_module_class_keys",
"get_linear_prepack_op_for_dtype",
"get_new_attr_name_with_prefix",
"get_non_observable_arg_indexes_and_types",
"get_qconv_prepack_op",
"get_skipped_module_name_and_classes",
"graph_module_from_producer_nodes",
"maybe_get_next_module",
"node_arg_is_bias",
"node_arg_is_weight",
"return_arg_list",
# torch.ao.quantization.pt2e.graph_utils
"bfs_trace_with_node_process",
"find_sequential_partitions",
"get_equivalent_types",
@ -860,80 +827,10 @@ coverage_ignore_functions = [
"get_latency_of_one_partition",
"get_latency_of_partitioned_graph",
"get_partition_to_latency_mapping",
# torch.fx.experimental.proxy_tensor
"decompose",
"disable_autocast_cache",
"disable_proxy_modes_tracing",
"dispatch_trace",
"extract_val",
"fake_signature",
"fetch_sym_proxy",
"fetch_object_proxy",
"get_innermost_proxy_mode",
"get_isolated_graphmodule",
"get_proxy_slot",
"get_torch_dispatch_modes",
"has_proxy_slot",
"is_sym_node",
"maybe_handle_decomp",
"proxy_call",
"set_meta",
"set_original_aten_op",
"set_proxy_slot",
"snapshot_fake",
"thunkify",
"track_tensor",
"track_tensor_tree",
"wrap_key",
"wrapper_and_args_for_make_fx",
# torch.fx.experimental.recording
"record_shapeenv_event",
"replay_shape_env_events",
"shape_env_check_state_equal",
# torch.fx.experimental.sym_node
"ceil_impl",
"floor_ceil_helper",
"floor_impl",
"method_to_operator",
"sympy_is_channels_last_contiguous_2d",
"sympy_is_channels_last_contiguous_3d",
"sympy_is_channels_last_strides_2d",
"sympy_is_channels_last_strides_3d",
"sympy_is_channels_last_strides_generic",
"sympy_is_contiguous",
"sympy_is_contiguous_generic",
"to_node",
"wrap_node",
"sym_sqrt",
# torch.fx.experimental.symbolic_shapes
"bind_symbols",
"cast_symbool_to_symint_guardless",
"create_contiguous",
"error",
"eval_guards",
"eval_is_non_overlapping_and_dense",
"expect_true",
"find_symbol_binding_fx_nodes",
"free_symbols",
"free_unbacked_symbols",
"fx_placeholder_targets",
"fx_placeholder_vals",
"guard_bool",
"guard_float",
"guard_int",
"guard_scalar",
"has_hint",
"has_symbolic_sizes_strides",
"is_channels_last_contiguous_2d",
"is_channels_last_contiguous_3d",
"is_channels_last_strides_2d",
"is_channels_last_strides_3d",
"is_contiguous",
"is_non_overlapping_and_dense_indicator",
"is_nested_int",
"is_symbol_binding_fx_node",
"is_symbolic",
# torch.fx.experimental.unification.core
"reify",
# torch.fx.experimental.unification.match
"edge",
@ -971,24 +868,6 @@ coverage_ignore_functions = [
"reverse_dict",
# torch.fx.experimental.unification.multipledispatch.variadic
"isvariadic",
# torch.fx.experimental.unification.unification_tools
"assoc",
"assoc_in",
"dissoc",
"first",
"get_in",
"getter",
"groupby",
"itemfilter",
"itemmap",
"keyfilter",
"keymap",
"merge",
"merge_with",
"update_in",
"valfilter",
"valmap",
# torch.fx.experimental.unification.utils
"freeze",
"hashable",
"raises",

View File

@ -12,6 +12,37 @@ These APIs are experimental and subject to change without notice.
.. autoclass:: torch.fx.experimental.sym_node.DynamicInt
```
## torch.fx.experimental.sym_node
```{eval-rst}
.. currentmodule:: torch.fx.experimental.sym_node
```
```{eval-rst}
.. automodule:: torch.fx.experimental.sym_node
```
```{eval-rst}
.. autosummary::
:toctree: generated
:nosignatures:
is_channels_last_contiguous_2d
is_channels_last_contiguous_3d
is_channels_last_strides_2d
is_channels_last_strides_3d
is_contiguous
is_non_overlapping_and_dense_indicator
method_to_operator
sympy_is_channels_last_contiguous_2d
sympy_is_channels_last_contiguous_3d
sympy_is_channels_last_strides_2d
sympy_is_channels_last_strides_3d
sympy_is_channels_last_strides_generic
sympy_is_contiguous
sympy_is_contiguous_generic
```
## torch.fx.experimental.symbolic_shapes
```{eval-rst}
@ -69,6 +100,25 @@ These APIs are experimental and subject to change without notice.
rebind_unbacked
resolve_unbacked_bindings
is_accessor_node
cast_symbool_to_symint_guardless
create_contiguous
error
eval_guards
eval_is_non_overlapping_and_dense
find_symbol_binding_fx_nodes
free_symbols
free_unbacked_symbols
fx_placeholder_targets
fx_placeholder_vals
guard_bool
guard_float
guard_int
guard_scalar
has_hint
has_symbolic_sizes_strides
is_nested_int
is_symbol_binding_fx_node
is_symbolic
```
## torch.fx.experimental.proxy_tensor
@ -91,4 +141,46 @@ These APIs are experimental and subject to change without notice.
get_proxy_mode
maybe_enable_thunkify
maybe_disable_thunkify
decompose
disable_autocast_cache
disable_proxy_modes_tracing
extract_val
fake_signature
fetch_object_proxy
fetch_sym_proxy
has_proxy_slot
is_sym_node
maybe_handle_decomp
proxy_call
set_meta
set_original_aten_op
set_proxy_slot
snapshot_fake
```
## torch.fx.experimental.unification.unification_tools
```{eval-rst}
.. currentmodule:: torch.fx.experimental.unification.unification_tools
```
```{eval-rst}
.. automodule:: torch.fx.experimental.unification.unification_tools
```
```{eval-rst}
.. autosummary::
:toctree: generated
:nosignatures:
assoc
assoc_in
dissoc
first
keyfilter
keymap
merge
merge_with
update_in
valfilter
valmap

View File

@ -1134,7 +1134,6 @@ The set of leaf modules can be customized by overriding
.. py:module:: torch.fx.experimental.refinement_types
.. py:module:: torch.fx.experimental.rewriter
.. py:module:: torch.fx.experimental.schema_type_annotation
.. py:module:: torch.fx.experimental.sym_node
.. py:module:: torch.fx.experimental.unification.core
.. py:module:: torch.fx.experimental.unification.dispatch
.. py:module:: torch.fx.experimental.unification.match
@ -1144,7 +1143,6 @@ The set of leaf modules can be customized by overriding
.. py:module:: torch.fx.experimental.unification.multipledispatch.dispatcher
.. py:module:: torch.fx.experimental.unification.multipledispatch.utils
.. py:module:: torch.fx.experimental.unification.multipledispatch.variadic
.. py:module:: torch.fx.experimental.unification.unification_tools
.. py:module:: torch.fx.experimental.unification.utils
.. py:module:: torch.fx.experimental.unification.variable
.. py:module:: torch.fx.experimental.unify_refinements

View File

@ -134,6 +134,23 @@ Quantization to work with this as well.
ObservationType
```
## torch.ao.quantization.backend_config.utils
```{eval-rst}
.. currentmodule:: torch.ao.quantization.backend_config.utils
```
```{eval-rst}
.. autosummary::
:toctree: generated
:nosignatures:
:template: classtemplate.rst
entry_to_pretty_str
pattern_to_human_readable
remove_boolean_dispatch_from_name
```
## torch.ao.quantization.fx.custom_config
This module contains a few CustomConfig classes that are used in both eager mode and FX graph mode quantization
@ -154,6 +171,30 @@ This module contains a few CustomConfig classes that's used in both eager mode a
StandaloneModuleConfigEntry
```
## torch.ao.quantization.fx.utils
```{eval-rst}
.. currentmodule:: torch.ao.quantization.fx.utils
```
```{eval-rst}
.. autosummary::
:toctree: generated
:nosignatures:
:template: classtemplate.rst
all_node_args_except_first
all_node_args_have_no_tensors
collect_producer_nodes
create_getattr_from_value
create_node_from_old_node_preserve_meta
graph_module_from_producer_nodes
maybe_get_next_module
node_arg_is_bias
node_arg_is_weight
return_arg_list
```
## torch.ao.quantization.quantizer
```{eval-rst}

View File

@ -172,9 +172,9 @@ ignore = [
"SIM102", "SIM103", "SIM112", # flake8-simplify code styles
"SIM105", # these ignores are from flake8-simplify. please fix or ignore with commented reason
"SIM108", # SIM108 ignored because we prefer if-else-block instead of ternary expression
"SIM110",
"SIM110", # Checks for for loops that can be replaced with a builtin function, like any or all.
"SIM114", # Combine `if` branches using logical `or` operator
"SIM115",
"SIM115", # Checks for cases where files are opened without using a context manager.
"SIM116", # Disable Use a dictionary instead of consecutive `if` statements
"SIM117",
"SIM118",
@ -184,7 +184,6 @@ ignore = [
"TC006",
# TODO: Remove Python-3.10 specific suppressions
"B905",
"UP035",
]
select = [
"B",
@ -261,6 +260,7 @@ select = [
"TRY401", # verbose-log-message
"UP",
"YTT",
"S101",
]
[tool.ruff.lint.pyupgrade]
@ -340,6 +340,39 @@ keep-runtime-typing = true
"tools/linter/**" = [
"LOG015" # please fix
]
"benchmarks/**" = [
"S101"
]
"test/**" = [
"S101"
]
"torchgen/**" = [
"S101"
]
"torch/**" = [
"S101"
]
"tools/**" = [
"S101"
]
"setup.py" = [
"S101"
]
"functorch/**" = [
"S101"
]
"docs/**" = [
"S101"
]
"android/**" = [
"S101"
]
".github/**" = [
"S101"
]
".ci/**" = [
"S101"
]
[tool.codespell]
ignore-words = "tools/linter/dictionary.txt"

Some files were not shown because too many files have changed in this diff.