Add torch::stable::Device (#166579)

Prior to this PR, the IValue <-> StableIValue conversion for `DeviceObjType` (aka c10::Device) packed the device into the leading bits of the StableIValue (which is a uint64_t). After this PR, the IValue <-> StableIValue conversion for `DeviceObjType` expects DeviceType to be packed into the upper 32 bits of the StableIValue and DeviceIndex to be packed into the lower 32 bits.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/166579
Approved by: https://github.com/janeyx99
[ez] Remove spammy deprecation log (#167470)
2025-11-13 03:14:45 +08:00 · 2025-11-10 22:21:14 +00:00 · 2025-11-10 21:49:23 +00:00 · 2025-11-10 21:47:53 +00:00 · 2025-11-10 21:45:45 +00:00 · 2025-11-10 21:06:37 +00:00
354 changed files with 7741 additions and 4603 deletions
--- a/.ci/docker/almalinux/build.sh
+++ b/.ci/docker/almalinux/build.sh
@ -36,11 +36,7 @@ case ${DOCKER_TAG_PREFIX} in
    ;;
  rocm*)
    BASE_TARGET=rocm
-    PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
-    # add gfx950, gfx115x conditionally starting in ROCm 7.0
-    if [[ "$ROCM_VERSION" == *"7.0"* ]]; then
-        PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950;gfx1150;gfx1151"
-    fi
+    PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201;gfx950;gfx1150;gfx1151"
    EXTRA_BUILD_ARGS="${EXTRA_BUILD_ARGS} --build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}"
    ;;
  *)
--- a/.ci/docker/build.sh
+++ b/.ci/docker/build.sh
@ -260,6 +260,12 @@ case "$tag" in
    HALIDE=yes
    TRITON=yes
    ;;
+  pytorch-linux-jammy-cuda12.8-py3.12-pallas)
+    CUDA_VERSION=12.8.1
+    ANACONDA_PYTHON_VERSION=3.12
+    GCC_VERSION=11
+    PALLAS=yes
+    ;;
  pytorch-linux-jammy-py3.12-triton-cpu)
    CUDA_VERSION=12.6
    ANACONDA_PYTHON_VERSION=3.12
@ -381,6 +387,7 @@ docker build \
       --build-arg "INDUCTOR_BENCHMARKS=${INDUCTOR_BENCHMARKS}" \
       --build-arg "EXECUTORCH=${EXECUTORCH}" \
       --build-arg "HALIDE=${HALIDE}" \
+       --build-arg "PALLAS=${PALLAS}" \
       --build-arg "XPU_VERSION=${XPU_VERSION}" \
       --build-arg "UNINSTALL_DILL=${UNINSTALL_DILL}" \
       --build-arg "ACL=${ACL:-}" \
--- a/.ci/docker/ci_commit_pins/jax.txt
+++ b/.ci/docker/ci_commit_pins/jax.txt
@ -0,0 +1 @@
+0.8.0
--- a/.ci/docker/common/install_jax.sh
+++ b/.ci/docker/common/install_jax.sh
@ -0,0 +1,40 @@
+#!/bin/bash
+
+set -ex
+
+source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh"
+
+# Get the pinned JAX version (same for all CUDA versions)
+JAX_VERSION=$(get_pinned_commit /ci_commit_pins/jax)
+
+function install_jax_12() {
+  echo "Installing JAX ${JAX_VERSION} with CUDA 12 support"
+  pip_install "jax[cuda12]==${JAX_VERSION}" -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html
+
+  # Verify installation
+  python -c "import jax"  # check for errors
+  echo "JAX ${JAX_VERSION} installation completed successfully for CUDA 12"
+}
+
+function install_jax_13() {
+  echo "Installing JAX ${JAX_VERSION} with CUDA 13 support"
+  pip_install "jax[cuda13]==${JAX_VERSION}" -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html
+
+  # Verify installation
+  python -c "import jax"  # check for errors
+  echo "JAX ${JAX_VERSION} installation completed successfully for CUDA 13"
+}
+
+# idiomatic parameter and option handling in sh
+while test $# -gt 0
+do
+    case "$1" in
+    12.4|12.6|12.6.*|12.8|12.8.*|12.9|12.9.*) install_jax_12;
+        ;;
+    13.0|13.0.*) install_jax_13;
+        ;;
+    *) echo "bad argument $1"; exit 1
+        ;;
+    esac
+    shift
+done
--- a/.ci/docker/libtorch/build.sh
+++ b/.ci/docker/libtorch/build.sh
@ -49,11 +49,7 @@ case ${DOCKER_TAG_PREFIX} in
        fi
        BASE_TARGET=rocm
        GPU_IMAGE=rocm/dev-ubuntu-22.04:${GPU_ARCH_VERSION}-complete
-        PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
-        # add gfx950, gfx115x conditionally starting in ROCm 7.0
-        if [[ "$GPU_ARCH_VERSION" == *"7.0"* ]]; then
-            PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950;gfx1150;gfx1151"
-        fi
+        PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201;gfx950;gfx1150;gfx1151"
        DOCKER_GPU_BUILD_ARG="--build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg ROCM_VERSION=${GPU_ARCH_VERSION}"
        ;;
    *)
--- a/.ci/docker/manywheel/build.sh
+++ b/.ci/docker/manywheel/build.sh
@ -87,11 +87,7 @@ case ${image} in
        MANY_LINUX_VERSION="2_28"
        DEVTOOLSET_VERSION="11"
        GPU_IMAGE=rocm/dev-almalinux-8:${GPU_ARCH_VERSION}-complete
-        PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
-        # add gfx950, gfx115x conditionally starting in ROCm 7.0
-        if [[ "$GPU_ARCH_VERSION" == *"7.0"* ]]; then
-            PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950;gfx1150;gfx1151"
-        fi
+        PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201;gfx950;gfx1150;gfx1151"
        DOCKER_GPU_BUILD_ARG="--build-arg ROCM_VERSION=${GPU_ARCH_VERSION} --build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg DEVTOOLSET_VERSION=${DEVTOOLSET_VERSION}"
        ;;
    manylinux2_28-builder:xpu)
--- a/.ci/docker/ubuntu/Dockerfile
+++ b/.ci/docker/ubuntu/Dockerfile
@ -143,6 +143,15 @@ COPY ci_commit_pins/halide.txt halide.txt
 RUN if [ -n "${HALIDE}" ]; then bash ./install_halide.sh; fi
 RUN rm install_halide.sh common_utils.sh halide.txt

+ARG PALLAS
+ARG CUDA_VERSION
+# Install JAX with CUDA support (for Pallas)
+COPY ./common/install_jax.sh install_jax.sh
+COPY ./common/common_utils.sh common_utils.sh
+COPY ./ci_commit_pins/jax.txt /ci_commit_pins/jax.txt
+RUN if [ -n "${PALLAS}" ]; then bash ./install_jax.sh ${CUDA_VERSION}; fi
+RUN rm -f install_jax.sh common_utils.sh /ci_commit_pins/jax.txt
+
 ARG ONNX
 # Install ONNX dependencies
 COPY ./common/install_onnx.sh ./common/common_utils.sh ./
--- a/.ci/lumen_cli/cli/lib/common/cli_helper.py
+++ b/.ci/lumen_cli/cli/lib/common/cli_helper.py
@ -8,9 +8,11 @@ from abc import ABC, abstractmethod


 try:
-    from typing import Any, Callable, Required, TypedDict  # Python 3.11+
+    from collections.abc import Callable  # Python 3.11+
+    from typing import Any, Required, TypedDict
 except ImportError:
-    from typing import Any, Callable, TypedDict
+    from collections.abc import Callable
+    from typing import Any, TypedDict

    from typing_extensions import Required  # Fallback for Python <3.11

--- a/.ci/pytorch/build.sh
+++ b/.ci/pytorch/build.sh
@ -168,14 +168,16 @@ if [[ "$BUILD_ENVIRONMENT" == *xpu* ]]; then
  # shellcheck disable=SC1091
  source /opt/intel/oneapi/compiler/latest/env/vars.sh
  # shellcheck disable=SC1091
+  source /opt/intel/oneapi/umf/latest/env/vars.sh
+  # shellcheck disable=SC1091
  source /opt/intel/oneapi/ccl/latest/env/vars.sh
  # shellcheck disable=SC1091
  source /opt/intel/oneapi/mpi/latest/env/vars.sh
+  # shellcheck disable=SC1091
+  source /opt/intel/oneapi/pti/latest/env/vars.sh
  # Enable XCCL build
  export USE_XCCL=1
  export USE_MPI=0
-  # XPU kineto feature dependencies are not fully ready, disable kineto build as temp WA
-  export USE_KINETO=0
  export TORCH_XPU_ARCH_LIST=pvc
 fi

--- a/.ci/pytorch/test.sh
+++ b/.ci/pytorch/test.sh
@ -208,6 +208,8 @@ if [[ "$BUILD_ENVIRONMENT" == *xpu* ]]; then
  source /opt/intel/oneapi/ccl/latest/env/vars.sh
  # shellcheck disable=SC1091
  source /opt/intel/oneapi/mpi/latest/env/vars.sh
+  # shellcheck disable=SC1091
+  source /opt/intel/oneapi/pti/latest/env/vars.sh
  # Check XPU status before testing
  timeout 30 xpu-smi discovery || true
 fi
@ -824,6 +826,11 @@ test_inductor_halide() {
  assert_git_not_dirty
 }

+test_inductor_pallas() {
+  python test/run_test.py --include inductor/test_pallas.py --verbose
+  assert_git_not_dirty
+}
+
 test_inductor_triton_cpu() {
  python test/run_test.py --include inductor/test_triton_cpu_backend.py inductor/test_torchinductor_strided_blocks.py --verbose
  assert_git_not_dirty
@ -1724,6 +1731,8 @@ elif [[ "${TEST_CONFIG}" == *inductor_distributed* ]]; then
  test_inductor_distributed
 elif [[ "${TEST_CONFIG}" == *inductor-halide* ]]; then
  test_inductor_halide
+elif [[ "${TEST_CONFIG}" == *inductor-pallas* ]]; then
+  test_inductor_pallas
 elif [[ "${TEST_CONFIG}" == *inductor-triton-cpu* ]]; then
  test_inductor_triton_cpu
 elif [[ "${TEST_CONFIG}" == *inductor-micro-benchmark* ]]; then
--- a/.github/ci_commit_pins/vision.txt
+++ b/.github/ci_commit_pins/vision.txt
@ -1 +1 @@
-ca2212438fdd8ce29b66999ed70ed54b0f9372d1
+ccb801b88af136454798b945175c4c87e636ac33
--- a/.github/ci_commit_pins/xla.txt
+++ b/.github/ci_commit_pins/xla.txt
@ -1 +1 @@
-c8b09f5f77d6bf6fb7ed7a9aa83e5d8156b3a5e9
+e4d25697f9dc5eedaf8f0a5bf085c62c5455a53a
--- a/.github/labeler.yml
+++ b/.github/labeler.yml
@ -138,7 +138,8 @@
 - test/test_matmul_cuda.py
 - test/test_scaled_matmul_cuda.py
 - test/inductor/test_fp8.py
- aten/src/ATen/native/cuda/Blas.cpp
+- aten/src/ATen/native/cuda/*Blas.cpp
+- aten/src/ATen/cuda/CUDA*Blas.*
 - torch/**/*cublas*
 - torch/_inductor/kernel/mm.py
 - test/inductor/test_max_autotune.py
@ -148,7 +149,8 @@
 - test/test_matmul_cuda.py
 - test/test_scaled_matmul_cuda.py
 - test/inductor/test_fp8.py
- aten/src/ATen/native/cuda/Blas.cpp
+- aten/src/ATen/native/cuda/*Blas.cpp
+- aten/src/ATen/cuda/CUDA*Blas.*
 - torch/**/*cublas*
 - torch/_inductor/kernel/mm.py
 - test/inductor/test_max_autotune.py
@ -158,7 +160,21 @@
 - test/test_matmul_cuda.py
 - test/test_scaled_matmul_cuda.py
 - test/inductor/test_fp8.py
- aten/src/ATen/native/cuda/Blas.cpp
+- aten/src/ATen/native/cuda/*Blas.cpp
+- aten/src/ATen/cuda/CUDA*Blas.*
 - torch/_inductor/kernel/mm.py
 - test/inductor/test_max_autotune.py
 - third_party/fbgemm
+
+"ciflow/mps":
+- aten/src/ATen/mps/**
+- aten/src/ATen/native/mps/**
+- torch/_inductor/codegen/mps.py
+- test/test_mps.py
+- test/inductor/test_mps_basic.py
+
+"ciflow/h100-symm-mem":
+- torch/csrc/distributed/c10d/symm_mem/**
+- torch/distributed/_symmetric_memory/**
+- test/distributed/**/*mem*
+- test/distributed/**/*mem*/**
--- a/.github/nitpicks.yml
+++ b/.github/nitpicks.yml
@ -10,3 +10,4 @@
  pathFilter:
    - 'torch/csrc/inductor/aoti_torch/c/*'
    - 'torch/csrc/inductor/aoti_torch/generated/*'
+    - 'torch/csrc/stable/c/*'
--- a/.github/scripts/delete_old_branches.py
+++ b/.github/scripts/delete_old_branches.py
@ -1,10 +1,11 @@
 # Delete old branches
 import os
 import re
+from collections.abc import Callable
 from datetime import datetime
 from functools import lru_cache
 from pathlib import Path
-from typing import Any, Callable
+from typing import Any

 from github_utils import gh_fetch_json_dict, gh_graphql
 from gitutils import GitRepo
--- a/.github/scripts/filter_test_configs.py
+++ b/.github/scripts/filter_test_configs.py
@ -8,10 +8,11 @@ import re
 import subprocess
 import sys
 import warnings
+from collections.abc import Callable
 from enum import Enum
 from functools import cache
 from logging import info
-from typing import Any, Callable, Optional
+from typing import Any, Optional
 from urllib.request import Request, urlopen

 import yaml
--- a/.github/scripts/get_workflow_job_id.py
+++ b/.github/scripts/get_workflow_job_id.py
@ -11,7 +11,8 @@ import sys
 import time
 import urllib
 import urllib.parse
-from typing import Any, Callable, Optional
+from collections.abc import Callable
+from typing import Any, Optional
 from urllib.request import Request, urlopen


--- a/.github/scripts/github_utils.py
+++ b/.github/scripts/github_utils.py
@ -3,8 +3,9 @@
 import json
 import os
 import warnings
+from collections.abc import Callable
 from dataclasses import dataclass
-from typing import Any, Callable, cast, Optional, Union
+from typing import Any, cast, Optional, Union
 from urllib.error import HTTPError
 from urllib.parse import quote
 from urllib.request import Request, urlopen
--- a/.github/scripts/gitutils.py
+++ b/.github/scripts/gitutils.py
@ -4,10 +4,10 @@ import os
 import re
 import tempfile
 from collections import defaultdict
-from collections.abc import Iterator
+from collections.abc import Callable, Iterator
 from datetime import datetime
 from functools import wraps
-from typing import Any, Callable, cast, Optional, TypeVar, Union
+from typing import Any, cast, Optional, TypeVar, Union


 T = TypeVar("T")
--- a/.github/scripts/trymerge.py
+++ b/.github/scripts/trymerge.py
@ -17,12 +17,12 @@ import re
 import time
 import urllib.parse
 from collections import defaultdict
-from collections.abc import Iterable
+from collections.abc import Callable, Iterable
 from dataclasses import dataclass
 from functools import cache
 from pathlib import Path
 from re import Pattern
-from typing import Any, Callable, cast, NamedTuple, Optional
+from typing import Any, cast, NamedTuple, Optional
 from warnings import warn

 import yaml
--- a/.github/workflows/b200-distributed.yml
+++ b/.github/workflows/b200-distributed.yml
@ -37,7 +37,6 @@ jobs:
    needs: get-label-type
    with:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runner: linux.12xlarge.memory
      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-distributed-b200
      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
      cuda-arch-list: '10.0'
--- a/.github/workflows/b200-symm-mem.yml
+++ b/.github/workflows/b200-symm-mem.yml
@ -37,7 +37,6 @@ jobs:
    needs: get-label-type
    with:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runner: linux.12xlarge.memory
      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100-symm
      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
      cuda-arch-list: '10.0'
--- a/.github/workflows/docker-builds.yml
+++ b/.github/workflows/docker-builds.yml
@ -67,6 +67,7 @@ jobs:
          pytorch-linux-jammy-py3.10-gcc11,
          pytorch-linux-jammy-py3-gcc11-inductor-benchmarks,
          pytorch-linux-jammy-py3.12-halide,
+          pytorch-linux-jammy-cuda12.8-py3.12-pallas,
          pytorch-linux-jammy-xpu-n-1-py3,
          pytorch-linux-noble-xpu-n-py3,
          pytorch-linux-noble-xpu-n-py3-inductor-benchmarks,
--- a/.github/workflows/h100-distributed.yml
+++ b/.github/workflows/h100-distributed.yml
@ -37,7 +37,6 @@ jobs:
    needs: get-label-type
    with:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runner: "linux.c7i.12xlarge"
      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm90-dist
      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
      cuda-arch-list: '9.0'
--- a/.github/workflows/inductor-unittest.yml
+++ b/.github/workflows/inductor-unittest.yml
@ -81,6 +81,32 @@ jobs:
      test-matrix: ${{ needs.inductor-halide-build.outputs.test-matrix }}
    secrets: inherit

+  inductor-pallas-build:
+    name: inductor-pallas-build
+    uses: ./.github/workflows/_linux-build.yml
+    needs: get-label-type
+    with:
+      build-environment: linux-jammy-cuda12.8-py3.12-gcc11
+      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-py3.12-pallas
+      cuda-arch-list: '8.9'
+      runner: linux.8xlarge.memory
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      test-matrix: |
+        { include: [
+          { config: "inductor-pallas", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.12xlarge.nvidia.gpu" },
+        ]}
+    secrets: inherit
+
+  inductor-pallas-test:
+    name: inductor-pallas-test
+    uses: ./.github/workflows/_linux-test.yml
+    needs: inductor-pallas-build
+    with:
+      build-environment: linux-jammy-cuda12.8-py3.12-gcc11  # keep identical to inductor-pallas-build
+      docker-image: ${{ needs.inductor-pallas-build.outputs.docker-image }}
+      test-matrix: ${{ needs.inductor-pallas-build.outputs.test-matrix }}
+    secrets: inherit
+
  inductor-triton-cpu-build:
    name: inductor-triton-cpu-build
    uses: ./.github/workflows/_linux-build.yml
--- a/.github/workflows/test-b200.yml
+++ b/.github/workflows/test-b200.yml
@ -52,7 +52,6 @@ jobs:
    needs: get-label-type
    with:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runner: linux.12xlarge.memory
      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100
      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
      cuda-arch-list: '10.0'
@ -73,4 +72,4 @@ jobs:
      docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm100-build.outputs.docker-image }}
      test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm100-build.outputs.test-matrix }}
      aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
-    secrets: inherit
+    secrets: inherit
--- a/.github/workflows/test-h100.yml
+++ b/.github/workflows/test-h100.yml
@ -41,7 +41,6 @@ jobs:
    needs: get-label-type
    with:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runner: linux.12xlarge.memory
      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm90
      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
      cuda-arch-list: '9.0'
--- a/.lintrunner.toml
+++ b/.lintrunner.toml
@ -1402,7 +1402,7 @@ init_command = [
    '--dry-run={{DRYRUN}}',
    'usort==1.0.8.post1',
    'isort==6.0.1',
-    'ruff==0.13.1',  # sync with RUFF
+    'ruff==0.14.4',  # sync with RUFF
 ]
 is_formatter = true

@ -1537,7 +1537,7 @@ init_command = [
    'python3',
    'tools/linter/adapters/pip_init.py',
    '--dry-run={{DRYRUN}}',
-    'ruff==0.13.1',  # sync with PYFMT
+    'ruff==0.14.4',  # sync with PYFMT
 ]
 is_formatter = true

--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -736,6 +736,44 @@ if(NOT DEFINED USE_BLAS)
  set(USE_BLAS ON)
 endif()

+# Prioritized Text Linker Optimization
+if(USE_PRIORITIZED_TEXT_FOR_LD)
+
+  set(LINKER_SCRIPT_FILE_IN "${CMAKE_SOURCE_DIR}/cmake/prioritized_text.txt")
+  set(LINKER_SCRIPT_FILE_OUT "${CMAKE_SOURCE_DIR}/cmake/linker_script.ld")
+
+  execute_process(
+    COMMAND ${Python_EXECUTABLE}
+            ${CMAKE_SOURCE_DIR}/tools/setup_helpers/generate_linker_script.py
+            --filein "${LINKER_SCRIPT_FILE_IN}"
+            --fout  "${LINKER_SCRIPT_FILE_OUT}"
+    RESULT_VARIABLE _gen_result
+    OUTPUT_VARIABLE _gen_output
+    ERROR_VARIABLE  _gen_error
+  )
+
+  if(NOT _gen_result EQUAL 0)
+    message(FATAL_ERROR
+      "Failed to generate linker script:\n${_gen_output}\n${_gen_error}")
+  endif()
+
+  append_cxx_flag_if_supported("-ffunction-sections" CMAKE_CXX_FLAGS)
+  append_cxx_flag_if_supported("-fdata-sections" CMAKE_CXX_FLAGS)
+  append_c_flag_if_supported("-ffunction-sections" CMAKE_C_FLAGS)
+  append_c_flag_if_supported("-fdata-sections" CMAKE_C_FLAGS)
+
+  set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -T${LINKER_SCRIPT_FILE_OUT}")
+  set(CMAKE_MODULE_LINKER_FLAGS "${CMAKE_MODULE_LINKER_FLAGS} -T${LINKER_SCRIPT_FILE_OUT}")
+
+else()
+  if(LINUX AND CPU_AARCH64)
+    message(WARNING [[
+    It is strongly recommend to enable linker script optimization for all AArch64 Linux builds.
+    To do so please export USE_PRIORITIZED_TEXT_FOR_LD=1
+    ]])
+  endif()
+endif()
+
 # Build libtorch mobile library, which contains ATen/TH ops and native support
 # for TorchScript model, but doesn't contain not-yet-unified caffe2 ops;
 if(INTERN_BUILD_MOBILE)
@ -1402,9 +1440,6 @@ if(BUILD_JNI)
  add_subdirectory(android/pytorch_android)
 endif()

-include(cmake/Summary.cmake)
-caffe2_print_configuration_summary()
-
 # Parse custom debug info
 if(DEFINED USE_CUSTOM_DEBINFO)
  string(REPLACE ";" " " SOURCE_FILES "${USE_CUSTOM_DEBINFO}")
@ -1444,56 +1479,5 @@ if(BUILD_BUNDLE_PTXAS AND USE_CUDA)
          DESTINATION "${CMAKE_INSTALL_BINDIR}")
 endif()

-if(USE_PRIORITIZED_TEXT_FOR_LD)
-  add_compile_options(
-    $<$<COMPILE_LANGUAGE:C,CXX>:-ffunction-sections>
-    $<$<COMPILE_LANGUAGE:C,CXX>:-fdata-sections>
-  )
-  set(LINKER_SCRIPT_FILE_OUT "${CMAKE_SOURCE_DIR}/cmake/linker_script.ld")
-  set(LINKER_SCRIPT_FILE_IN "${CMAKE_SOURCE_DIR}/cmake/prioritized_text.txt")
-
-  add_custom_command(
-    OUTPUT "${LINKER_SCRIPT_FILE_OUT}"
-    COMMAND ${Python_EXECUTABLE} ${CMAKE_SOURCE_DIR}/tools/setup_helpers/generate_linker_script.py --filein "${LINKER_SCRIPT_FILE_IN}" --fout "${LINKER_SCRIPT_FILE_OUT}"
-    DEPENDS ${CMAKE_SOURCE_DIR}/tools/setup_helpers/generate_linker_script.py "${LINKER_SCRIPT_FILE_IN}"
-    COMMENT "Generating prioritized text linker files"
-    VERBATIM
-  )
-
-  add_custom_target(generate_linker_script DEPENDS "${LINKER_SCRIPT_FILE_OUT}")
-
-  if(BUILD_PYTHON)
-    set(LINKER_OPT_TARGETS torch_python)
-  endif()
-
-  if(NOT BUILD_LIBTORCHLESS)
-    list(APPEND LINKER_OPT_TARGETS torch_cpu c10)
-    if(USE_CUDA)
-      list(APPEND LINKER_OPT_TARGETS torch_cuda c10_cuda)
-    endif()
-    if(USE_XPU)
-      list(APPEND LINKER_OPT_TARGETS torch_xpu c10_xpu)
-    endif()
-    if(USE_ROCM)
-      list(APPEND LINKER_OPT_TARGETS torch_hip c10_hip)
-    endif()
-  endif()
-
-  foreach(tgt IN LISTS LINKER_OPT_TARGETS)
-    if(TARGET ${tgt})
-      add_dependencies("${tgt}" generate_linker_script)
-      target_link_options_if_supported(${tgt} "-T,${LINKER_SCRIPT_FILE_OUT}")
-      set_property(TARGET ${tgt} APPEND PROPERTY LINK_DEPENDS "${LINKER_SCRIPT_FILE_OUT}")
-    else()
-       message(WARNING "Requested target '${tgt}' for linker script optimization was not found.")
-    endif()
-  endforeach()
-
-else()
-  if(LINUX AND CPU_AARCH64)
-    message(WARNING [[
-    It is strongly recommend to enable linker script optimization for all AArch64 Linux builds.
-    To do so please export USE_PRIORITIZED_TEXT_FOR_LD=1
-    ]])
-  endif()
-endif()
+include(cmake/Summary.cmake)
+caffe2_print_configuration_summary()
--- a/6
+++ b/6
@ -210,8 +210,12 @@ torch/backends/cudnn/ @eqy @syed-ahmed @Aidyn-A
 /test/inductor/test_flex_attention.py @drisspg
 /test/inductor/test_flex_decoding.py @drisspg

-# Low Precision GEMMs
+# Low Precision & Grouped GEMMs
 /aten/src/ATen/native/cuda/Blas.cpp @drisspg @slayton58
+/aten/src/ATen/native/cuda/GroupedBlas.cpp @drisspg @slayton58
+/aten/src/ATen/native/cuda/ScaledBlas.cpp @drisspg @slayton58
 /aten/src/ATen/cuda/CUDABlas.cpp @drisspg @slayton58
 /aten/src/ATen/cuda/CUDABlas.h @drisspg @slayton58
+/aten/src/ATen/cuda/CUDAScaledBlas.cpp @drisspg @slayton58
+/aten/src/ATen/cuda/CUDAScaledBlas.h @drisspg @slayton58
 /test/test_scaled_matmul_cuda.py @drisspg @slayton58
--- a/aten/src/ATen/DeviceAccelerator.h
+++ b/aten/src/ATen/DeviceAccelerator.h
@ -94,6 +94,11 @@ TORCH_API inline void resetPeakStats(c10::DeviceIndex device_index) {
  at::getDeviceAllocator(device_type)->resetPeakStats(device_index);
 }

+TORCH_API inline std::pair<size_t, size_t> getMemoryInfo(
+    c10::DeviceIndex device_index) {
+  const auto device_type = getAccelerator(true).value();
+  return at::getDeviceAllocator(device_type)->getMemoryInfo(device_index);
+}
 } // namespace at::accelerator

 namespace at {
--- a/aten/src/ATen/core/CachingHostAllocator.h
+++ b/aten/src/ATen/core/CachingHostAllocator.h
@ -226,8 +226,8 @@ template <
    typename B = HostBlock<S>>
 struct CachingHostAllocatorImpl {
  virtual ~CachingHostAllocatorImpl() {
-    active_ = false;
-    if (pinned_use_background_threads()) {
+    if (active_) {
+      active_ = false;
      getBackgroundThreadPool()->waitWorkComplete();
    }
  }
@ -260,6 +260,7 @@ struct CachingHostAllocatorImpl {
    if (pinned_use_background_threads()) {
      // Launch the background thread and process events in a loop.
      static bool background_thread_flag [[maybe_unused]] = [this] {
+        active_ = true;
        getBackgroundThreadPool()->run([&]() {
          while (active_) {
            process_events();
@ -683,9 +684,9 @@ struct CachingHostAllocatorImpl {
  alignas(hardware_destructive_interference_size) std::mutex events_mutex_;
  std::deque<std::pair<E, B*>> events_; // event queue paired with block

-  // Indicates whether the object is active.
+  // Indicates whether the event-processing thread pool is active.
  // Set to false in the destructor to signal background threads to stop.
-  std::atomic<bool> active_{true};
+  std::atomic<bool> active_{false};
 protected:
  alignas(hardware_destructive_interference_size) HostStatsStaged stats_;
 };
--- a/aten/src/ATen/core/TensorBase.h
+++ b/aten/src/ATen/core/TensorBase.h
@ -245,6 +245,9 @@ class TORCH_API TensorBase {
  size_t weak_use_count() const noexcept {
    return impl_.weak_use_count();
  }
+  bool is_uniquely_owned() const noexcept {
+    return impl_.is_uniquely_owned();
+  }

  std::string toString() const;

--- a/aten/src/ATen/functorch/BatchedTensorImpl.h
+++ b/aten/src/ATen/functorch/BatchedTensorImpl.h
@ -157,6 +157,8 @@ constexpr DispatchKeySet kKeysToPropagateToWrapper({
  DispatchKey::Negative,
  DispatchKey::Conjugate,
  DispatchKey::XLA,
+  DispatchKey::XPU,
+  DispatchKey::HPU,
  DispatchKey::CUDA,
  DispatchKey::CPU,
  DispatchKey::PrivateUse1,
--- a/aten/src/ATen/native/mps/operations/Blas.mm
+++ b/aten/src/ATen/native/mps/operations/Blas.mm
@ -141,6 +141,9 @@ static Tensor& addmv_out_mps_impl(const Tensor& self,
  };

  MPSStream* stream = at::mps::getCurrentMPSStream();
+  if (result.numel() == 0) {
+    return result;
+  }
  Tensor matMulVec = at::mm(mat, vec.unsqueeze(1)).squeeze(1);

  @autoreleasepool {
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@ -2803,7 +2803,7 @@
 - func: floor_divide.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
  device_check: NoCheck   # TensorIterator
  dispatch:
-    CPU, CUDA, MPS: floor_divide_out
+    CPU, CUDA, MPS, MTIA: floor_divide_out
    SparseCPU, SparseCUDA, SparseMPS: floor_divide_out_sparse_zerodim

 - func: floor_divide.Scalar(Tensor self, Scalar other) -> Tensor
@ -4292,6 +4292,7 @@
  dispatch:
    SparseCPU: sparse_sparse_matmul_cpu
    SparseCUDA: sparse_sparse_matmul_cuda
+    SparseMPS: sparse_sparse_matmul_mps
  autogen: _sparse_sparse_matmul.out

 - func: mode(Tensor self, int dim=-1, bool keepdim=False) -> (Tensor values, Tensor indices)
@ -4383,7 +4384,7 @@
  variants: function, method
  dispatch:
    CompositeExplicitAutograd: mv
-    SparseCPU, SparseCUDA: mv_sparse
+    SparseCPU, SparseCUDA, SparseMPS: mv_sparse

 - func: mv.out(Tensor self, Tensor vec, *, Tensor(a!) out) -> Tensor(a!)
  dispatch:
@ -9832,7 +9833,7 @@
  structured_delegate: erfinv.out
  variants: method, function
  dispatch:
-    SparseCPU, SparseCUDA: erfinv_sparse
+    SparseCPU, SparseCUDA, SparseMPS: erfinv_sparse
    SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: erfinv_sparse_csr
  tags: pointwise

@ -9841,7 +9842,7 @@
  structured_delegate: erfinv.out
  variants: method
  dispatch:
-    SparseCPU, SparseCUDA: erfinv_sparse_
+    SparseCPU, SparseCUDA, SparseMPS: erfinv_sparse_
    SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: erfinv_sparse_csr_
  tags: pointwise

@ -9851,7 +9852,7 @@
  structured_inherits: TensorIteratorBase
  dispatch:
    CPU, CUDA, MPS: erfinv_out
-    SparseCPU, SparseCUDA: erfinv_sparse_out
+    SparseCPU, SparseCUDA, SparseMPS: erfinv_sparse_out
    SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: erfinv_sparse_csr_out
  tags: pointwise

--- a/aten/src/ATen/native/sparse/mps/SparseMPSTensorMath.mm
+++ b/aten/src/ATen/native/sparse/mps/SparseMPSTensorMath.mm
@ -10,6 +10,10 @@
 #include <ATen/NativeFunctions.h>
 #else
 #include <ATen/ops/_coalesce_native.h>
+#include <ATen/ops/repeat_interleave_native.h>
+#include <ATen/ops/cumsum.h>
+#include <ATen/ops/_sparse_sparse_matmul_native.h>
+#include <ATen/ops/_sparse_coo_tensor_unsafe.h>
 #include <ATen/ops/_sparse_coo_tensor_unsafe_native.h>
 #include <ATen/ops/cat.h>
 #include <ATen/ops/add_native.h>
@ -888,5 +892,114 @@ static void sparse_mask_intersection_out_mps_kernel(
      /*coalesce_mask=*/false);
 }

+Tensor sparse_sparse_matmul_mps(const Tensor& mat1_, const Tensor& mat2_) {
+  TORCH_CHECK(mat1_.is_sparse() && mat2_.is_sparse(),
+              "sparse_sparse_matmul_mps: both inputs must be sparse COO tensors");
+  TORCH_CHECK(mat1_.is_mps() && mat2_.is_mps(),
+              "sparse_sparse_matmul_mps: both inputs must be on MPS device");
+  TORCH_CHECK(mat1_.dim() == 2 && mat2_.dim() == 2,
+              "sparse_sparse_matmul_mps: both inputs must be 2D matrices");
+  TORCH_CHECK(mat1_.dense_dim() == 0 && mat2_.dense_dim() == 0,
+              "sparse_sparse_matmul_mps: only scalar values supported (dense_dim == 0)");
+  TORCH_CHECK(mat1_.size(1) == mat2_.size(0),
+              "mat1 and mat2 shapes cannot be multiplied (", mat1_.size(0), "x", mat1_.size(1), " and ", mat2_.size(0), "x", mat2_.size(1), ")");
+  TORCH_CHECK(mat1_.scalar_type() == mat2_.scalar_type(),
+              "sparse_sparse_matmul_mps: mat1 dtype ", mat1_.scalar_type(),
+              " does not match mat2 dtype ", mat2_.scalar_type());
+
+  const auto device = mat1_.device();
+
+  auto A = mat1_.coalesce();
+  auto B = mat2_.coalesce();
+
+  const auto I = A.size(0);
+  const auto K = A.size(1);
+  const auto N = B.size(1);
+
+  const auto nnzA = A._nnz();
+  const auto nnzB = B._nnz();
+
+  // Early empty result, return an empty, coalesced tensor
+  if (I == 0 || N == 0 || K == 0 || nnzA == 0 || nnzB == 0) {
+    auto empty_idx = at::empty({2, 0}, at::device(device).dtype(at::kLong));
+    auto empty_val = at::empty({0}, at::device(device).dtype(mat1_.scalar_type()));
+    auto out = _sparse_coo_tensor_unsafe(empty_idx, empty_val, {I, N}, mat1_.options());
+    out._coalesced_(true);
+    return out;
+  }
+
+  const auto computeDtype = at::result_type(mat1_, mat2_);
+
+  auto A_idx = A._indices().contiguous();
+  auto A_val = A._values().to(computeDtype).contiguous();
+  auto A_i = A_idx.select(0, 0).contiguous();
+  auto A_k = A_idx.select(0, 1).contiguous();
+
+  auto B_idx = B._indices().contiguous();
+  auto B_val = B._values().to(computeDtype).contiguous();
+  auto B_k = B_idx.select(0, 0).contiguous();
+  auto B_j = B_idx.select(0, 1).contiguous();
+
+  // csr-style row pointers for B by k (the shared dimension)
+  Tensor row_ptr_B;
+  {
+    auto batch_ptr = at::tensor({0LL, nnzB}, at::device(device).dtype(at::kLong));
+    row_ptr_B = at::empty({K + 1}, at::device(device).dtype(at::kLong));
+    build_row_ptr_per_batch_mps(B_k, batch_ptr, /*B=*/1, /*I=*/K, row_ptr_B);
+  }
+
+  auto row_ptr_B_lo = row_ptr_B.narrow(0, 0, K);
+  auto row_ptr_B_hi = row_ptr_B.narrow(0, 1, K);
+  auto deg_B = row_ptr_B_hi.sub(row_ptr_B_lo);
+
+  auto counts = deg_B.index_select(0, A_k);
+
+  const int64_t P = counts.sum().item<int64_t>();
+  if (P == 0) {
+    auto empty_idx = at::empty({2, 0}, at::device(device).dtype(at::kLong));
+    auto empty_val = at::empty({0}, at::device(device).dtype(mat1_.scalar_type()));
+    auto out = _sparse_coo_tensor_unsafe(empty_idx, empty_val, {I, N}, mat1_.options());
+    out._coalesced_(true);
+    return out;
+  }
+
+  auto group_ids = repeat_interleave_mps(counts);
+
+  // exclusive cumsum of counts
+  auto offsets = cumsum(counts, /*dim=*/0).sub(counts);
+  auto offsets_gather = offsets.index_select(0, group_ids);
+  auto within = at::arange(P, at::device(device).dtype(at::kLong)).sub(offsets_gather);
+
+  // Map each output element to its source B row and position
+  auto k_per_out = A_k.index_select(0, group_ids);
+  auto start_in_B = row_ptr_B.index_select(0, k_per_out);
+  auto seg_index = start_in_B.add(within);
+
+  // Assemble candidate coo pairs and values
+  auto i_out = A_i.index_select(0, group_ids).contiguous();
+  auto j_out = B_j.index_select(0, seg_index).contiguous();
+  auto vA_out = A_val.index_select(0, group_ids).contiguous();
+  auto vB_out = B_val.index_select(0, seg_index).contiguous();
+  auto v_out = vA_out.mul(vB_out);
+
+  // build (2, P) indices
+  auto out_indices = at::empty({2, P}, at::device(device).dtype(at::kLong)).contiguous();
+  out_indices.select(0, 0).copy_(i_out);
+  out_indices.select(0, 1).copy_(j_out);
+
+  auto result = _sparse_coo_tensor_unsafe(
+      out_indices, v_out, {I, N}, mat1_.options().dtype(computeDtype));
+
+  result = result.coalesce();
+
+  if (result.scalar_type() != mat1_.scalar_type()) {
+    auto cast_vals = result._values().to(mat1_.scalar_type());
+    auto out = _sparse_coo_tensor_unsafe(result._indices(), cast_vals, {I, N}, mat1_.options());
+    out._coalesced_(true);
+    return out;
+  }
+  return result;
+}
+
 REGISTER_MPS_DISPATCH(sparse_mask_intersection_out_stub, &sparse_mask_intersection_out_mps_kernel);
 } // namespace at::native
--- a/aten/tools/valgrind.sup
+++ b/aten/tools/valgrind.sup
@ -10,6 +10,13 @@
   ...
 }

+{
+   ignore_empty_generic_uninitialised_conditional_jump
+   Memcheck:Cond
+   fun:_ZN2at6detail13empty_genericEN3c108ArrayRefIlEEPNS1_9AllocatorENS1_14DispatchKeySetENS1_10ScalarTypeESt8optionalINS1_12MemoryFormatEE
+   ...
+}
+
 {
   Cond_cuda
   Memcheck:Cond
--- a/benchmarks/sparse/spmm.py
+++ b/benchmarks/sparse/spmm.py
@ -52,19 +52,18 @@ def test_sparse_coo_and_csr(m, n, k, nnz, test_count):
        start.record()
        coo.matmul(mat)
        stop.record()
-
        times.append(start.elapsed_time(stop))

-        coo_mean_time = sum(times) / len(times)
+    coo_mean_time = sum(times) / len(times)

-        times = []
-        for _ in range(test_count):
-            start.record()
-            csr.matmul(mat)
-            stop.record()
-            times.append(start.elapsed_time(stop))
+    times = []
+    for _ in range(test_count):
+        start.record()
+        csr.matmul(mat)
+        stop.record()
+        times.append(start.elapsed_time(stop))

-            csr_mean_time = sum(times) / len(times)
+    csr_mean_time = sum(times) / len(times)

    return coo_mean_time, csr_mean_time

--- a/c10/core/AutogradState.h
+++ b/c10/core/AutogradState.h
@ -1,6 +1,8 @@
 #pragma once

+#include <c10/core/SafePyObject.h>
 #include <c10/macros/Export.h>
+#include <optional>

 namespace c10 {

@ -15,7 +17,8 @@ struct C10_API AutogradState {
      bool inference_mode,
      bool fw_grad_mode,
      bool multithreading_enabled)
-      : grad_mode_(grad_mode),
+      : graph_exec_group_(std::nullopt),
+        grad_mode_(grad_mode),
        inference_mode_(inference_mode),
        fw_grad_mode_(fw_grad_mode),
        multithreading_enabled_(multithreading_enabled),
@ -41,6 +44,10 @@ struct C10_API AutogradState {
    view_replay_enabled_ = view_replay_enabled;
  }

+  // Replaces the stored graph exec group with `group`, which is taken by
+  // value and moved into place. Passing std::nullopt leaves the state
+  // without a group.
+  void set_graph_exec_group(std::optional<SafePyObject> group) {
+    graph_exec_group_ = std::move(group);
+  }
+
  bool get_grad_mode() const {
    return grad_mode_;
  }
@ -61,7 +68,12 @@ struct C10_API AutogradState {
    return view_replay_enabled_;
  }

+  // Returns the stored graph exec group by const reference. The value is
+  // std::nullopt until set_graph_exec_group() stores something (the
+  // constructor initializes it to std::nullopt).
+  const std::optional<SafePyObject>& get_graph_exec_group() const {
+    return graph_exec_group_;
+  }
+
 private:
+  std::optional<SafePyObject> graph_exec_group_;
  bool grad_mode_ : 1;
  bool inference_mode_ : 1;
  bool fw_grad_mode_ : 1;
--- a/c10/core/CachingDeviceAllocator.h
+++ b/c10/core/CachingDeviceAllocator.h
@ -96,6 +96,10 @@ struct C10_API DeviceAllocator : public c10::Allocator {

  // Resets peak memory usage statistics for the specified device
  virtual void resetPeakStats(c10::DeviceIndex device) = 0;
+
+  // Return the free memory size and total memory size in bytes for the
+  // specified device.
+  virtual std::pair<size_t, size_t> getMemoryInfo(c10::DeviceIndex device) = 0;
 };

 // This function is used to get the DeviceAllocator for a specific device type
--- a/c10/core/SafePyObject.h
+++ b/c10/core/SafePyObject.h
@ -44,7 +44,7 @@ struct C10_API SafePyObject {
      (*other.pyinterpreter_)->incref(other.data_);
    }
    if (data_ != nullptr) {
-      (*pyinterpreter_)->decref(data_, /*has_pyobj_slot*/ false);
+      (*pyinterpreter_)->decref(data_);
    }
    data_ = other.data_;
    pyinterpreter_ = other.pyinterpreter_;
@ -53,7 +53,7 @@ struct C10_API SafePyObject {

  ~SafePyObject() {
    if (data_ != nullptr) {
-      (*pyinterpreter_)->decref(data_, /*has_pyobj_slot*/ false);
+      (*pyinterpreter_)->decref(data_);
    }
  }

--- a/c10/core/StorageImpl.cpp
+++ b/c10/core/StorageImpl.cpp
@ -48,6 +48,30 @@ void warnDeprecatedDataPtr() {
  TORCH_CHECK(false, "Cannot access data pointer of Storage that is invalid.");
 }

+// Takes one additional reference on the PyObject stored in pyobj_slot_ by
+// calling incref() on the slot's interpreter.
+// NOTE(review): the interpreter pointer is dereferenced without a null
+// check; presumably this is only reached when the kHasPyObject bit is set
+// (as in intrusive_ptr::retain_) - confirm for any other caller.
+void StorageImpl::incref_pyobject() const {
+  // Because intrusive_ptr incref uses relaxed memory order, we need to
+  // do an acquire fence to ensure that the kHasPyObject bit was
+  // observed before the load of the PyObject* below.
+  // NB: This is a no-op on x86/x86-64
+  std::atomic_thread_fence(std::memory_order_acquire);
+
+  PyObject* obj = pyobj_slot_.load_pyobj();
+  (*pyobj_slot_.pyobj_interpreter())->incref(obj);
+}
+
+// Drops one reference on the PyObject stored in pyobj_slot_ by calling
+// decref() on the slot's interpreter. PyInterpreterVTable::decref is
+// documented as not assuming the GIL is held by the caller.
+// NOTE(review): no null check on the interpreter or the PyObject here;
+// presumably guarded by the kHasPyObject bit at the call site - confirm.
+void StorageImpl::decref_pyobject() const {
+  PyObject* obj = pyobj_slot_.load_pyobj();
+  (*pyobj_slot_.pyobj_interpreter())->decref(obj);
+}
+
+// Forwards to the interpreter's try_incref() for this storage's PyObject
+// slot and returns its result. Returns false without touching the slot
+// when no interpreter has been recorded in it.
+bool StorageImpl::try_incref_pyobject() const {
+  c10::impl::PyInterpreter* interp = pyobj_slot_.pyobj_interpreter();
+  // No interpreter recorded: there is nothing to take a reference on.
+  if (C10_UNLIKELY(!interp)) {
+    return false;
+  }
+  return (*interp)->try_incref(pyobj_slot_);
+}
+
 void SetStorageImplCreate(DeviceType t, StorageImplCreateHelper fptr) {
  // Allowlist verification.
  // Only if the devicetype is in the allowlist,
--- a/c10/core/StorageImpl.h
+++ b/c10/core/StorageImpl.h
@ -105,6 +105,12 @@ struct C10_API StorageImpl : public c10::intrusive_ptr_target {
    data_ptr_.clear();
  }

+  void incref_pyobject() const override final;
+
+  void decref_pyobject() const override final;
+
+  bool try_incref_pyobject() const override final;
+
  size_t nbytes() const {
    // OK to do this instead of maybe_as_int as nbytes is guaranteed positive
    TORCH_CHECK(!size_bytes_is_heap_allocated_);
@ -370,4 +376,14 @@ C10_API c10::intrusive_ptr<c10::StorageImpl> make_storage_impl(
    bool resizable,
    std::optional<at::Device> device_opt);

+namespace detail {
+// Partial specialization selected for c10::StorageImpl and every type
+// derived from it (cv-qualifiers on T are ignored). It sets
+// can_have_pyobject to true, overriding the primary template's false, so
+// that intrusive_ptr compiles in its PyObject handling for these targets.
+template <class T>
+struct TargetTraits<
+    T,
+    std::enable_if_t<
+        std::is_base_of_v<c10::StorageImpl, std::remove_cv_t<T>>>> {
+  static constexpr bool can_have_pyobject = true;
+};
+} // namespace detail
+
 } // namespace c10
--- a/c10/core/TensorImpl.cpp
+++ b/c10/core/TensorImpl.cpp
@ -277,7 +277,6 @@ void TensorImpl::release_resources() {
  if (storage_) {
    storage_ = {};
  }
-  pyobj_slot_.maybe_destroy_pyobj();
 }

 #ifndef C10_DISABLE_TENSORIMPL_EXTENSIBILITY
@ -989,6 +988,30 @@ void TensorImpl::empty_tensor_restride_symint(MemoryFormat memory_format) {
  }
 }

+// Takes one additional reference on the PyObject stored in pyobj_slot_ by
+// calling incref() on the slot's interpreter.
+// NOTE(review): the interpreter pointer is dereferenced without a null
+// check; presumably this is only reached when the kHasPyObject bit is set
+// (as in intrusive_ptr::retain_) - confirm for any other caller.
+void TensorImpl::incref_pyobject() const {
+  // Because intrusive_ptr incref uses relaxed memory order, we need to
+  // do an acquire fence to ensure that the kHasPyObject bit was
+  // observed before the load of the PyObject* below.
+  // NB: This is a no-op on x86/x86-64
+  std::atomic_thread_fence(std::memory_order_acquire);
+
+  PyObject* obj = pyobj_slot_.load_pyobj();
+  (*pyobj_slot_.pyobj_interpreter())->incref(obj);
+}
+
+// Drops one reference on the PyObject stored in pyobj_slot_ by calling
+// decref() on the slot's interpreter. PyInterpreterVTable::decref is
+// documented as not assuming the GIL is held by the caller.
+// NOTE(review): no null check on the interpreter or the PyObject here;
+// presumably guarded by the kHasPyObject bit at the call site - confirm.
+void TensorImpl::decref_pyobject() const {
+  PyObject* obj = pyobj_slot_.load_pyobj();
+  (*pyobj_slot_.pyobj_interpreter())->decref(obj);
+}
+
+// Forwards to the interpreter's try_incref() for this tensor's PyObject
+// slot and returns its result. Returns false without touching the slot
+// when no interpreter has been recorded in it.
+bool TensorImpl::try_incref_pyobject() const {
+  c10::impl::PyInterpreter* interp = pyobj_slot_.pyobj_interpreter();
+  // No interpreter recorded: there is nothing to take a reference on.
+  if (C10_UNLIKELY(!interp)) {
+    return false;
+  }
+  return (*interp)->try_incref(pyobj_slot_);
+}
+
 namespace impl {

 namespace {
--- a/c10/core/TensorImpl.h
+++ b/c10/core/TensorImpl.h
@ -2176,6 +2176,12 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target {
    return &pyobj_slot_;
  }

+  void incref_pyobject() const override final;
+
+  void decref_pyobject() const override final;
+
+  bool try_incref_pyobject() const override final;
+
 private:
  // See NOTE [std::optional operator usage in CUDA]
  // We probably don't want to expose this publicly until
@ -3077,6 +3083,17 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target {
  friend class C10_TensorImpl_Size_Check_Dummy_Class;
 };

+namespace detail {
+
+// Partial specialization selected for c10::TensorImpl and every type
+// derived from it (cv-qualifiers on T are ignored). It sets
+// can_have_pyobject to true, overriding the primary template's false, so
+// that intrusive_ptr compiles in its PyObject handling for these targets.
+template <class T>
+struct TargetTraits<
+    T,
+    std::enable_if_t<std::is_base_of_v<c10::TensorImpl, std::remove_cv_t<T>>>> {
+  static constexpr bool can_have_pyobject = true;
+};
+
+} // namespace detail
+
 // Note [TensorImpl size constraints]
 // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 // Changed the size of TensorImpl?  If the size went down, good for
--- a/c10/core/impl/PyInterpreter.cpp
+++ b/c10/core/impl/PyInterpreter.cpp
@ -11,8 +11,11 @@ struct NoopPyInterpreterVTable final : public PyInterpreterVTable {

  void incref(PyObject* pyobj) const override {} // do nothing

-  void decref(PyObject* pyobj, bool has_pyobj_slot) const override {
-  } // do nothing
+  void decref(PyObject* pyobj) const override {} // do nothing
+
+  bool try_incref(const c10::impl::PyObjectSlot& pyobj_slot) const override {
+    return false;
+  }

 #define PANIC(m)              \
  TORCH_INTERNAL_ASSERT(      \
@ -20,6 +23,10 @@ struct NoopPyInterpreterVTable final : public PyInterpreterVTable {
      "attempted to call " #m \
      " on a Tensor with nontrivial PyObject after corresponding interpreter died")

+  size_t refcnt(PyObject* pyobj) const override {
+    PANIC(refcnt);
+  }
+
  c10::intrusive_ptr<TensorImpl> detach(const TensorImpl* self) const override {
    PANIC(detach);
  }
--- a/c10/core/impl/PyInterpreter.h
+++ b/c10/core/impl/PyInterpreter.h
@ -18,6 +18,9 @@ namespace c10 {
 struct IValue;
 class OperatorHandle;
 struct TensorImpl;
+namespace impl {
+struct PyObjectSlot;
+} // namespace impl
 } // namespace c10

 namespace torch::jit {
@ -126,9 +129,12 @@ struct C10_API PyInterpreterVTable {

  // Run Py_INCREF on a PyObject.
  virtual void incref(PyObject* pyobj) const = 0;
-  // Run Py_DECREF on a PyObject.  We DO NOT assume the GIL is held on call
-  // See NOTE [PyInterpreter::decref takes a `has_pyobj_slot` arg]
-  virtual void decref(PyObject* pyobj, bool has_pyobj_slot) const = 0;
+  // Run Py_DECREF on a PyObject.  We DO NOT assume the GIL is held on call.
+  virtual void decref(PyObject* pyobj) const = 0;
+  // Run PyUnstable_TryIncRef on a PyObject if it's not NULL.
+  virtual bool try_incref(const c10::impl::PyObjectSlot& pyobj_slot) const = 0;
+  // Run Py_REFCNT on a PyObject.
+  virtual size_t refcnt(PyObject* pyobj) const = 0;

  // Perform a detach by deferring to the __torch_dispatch__ implementation of
  // detach, which will also arrange for the PyObject to get copied in this
--- a/c10/core/impl/PyObjectSlot.cpp
+++ b/c10/core/impl/PyObjectSlot.cpp
@ -1,56 +0,0 @@
-#include <c10/core/impl/PyObjectSlot.h>
-
-namespace c10::impl {
-
-PyObjectSlot::PyObjectSlot() : pyobj_interpreter_(nullptr), pyobj_(nullptr) {}
-
-PyObjectSlot::~PyObjectSlot() {
-  maybe_destroy_pyobj();
-}
-
-void PyObjectSlot::maybe_destroy_pyobj() {
-  if (owns_pyobj()) {
-    TORCH_INTERNAL_ASSERT(pyobj_interpreter_ != nullptr);
-    TORCH_INTERNAL_ASSERT(pyobj_ != nullptr);
-    (*pyobj_interpreter_.load(std::memory_order_acquire))
-        ->decref(_unchecked_untagged_pyobj(), /*has_pyobj_slot*/ true);
-    // NB: this destructor can only be entered when there are no
-    // references to this C++ object (obviously), NOR any references
-    // to the PyObject (if there are references to the PyObject,
-    // then the PyObject holds an owning reference to the tensor).
-    // So it is OK to clear pyobj_ here as it is impossible for it to
-    // be used again (modulo weak reference races)
-    pyobj_ = nullptr; // for safety
-  }
-}
-
-PyInterpreter* PyObjectSlot::pyobj_interpreter() {
-  return pyobj_interpreter_.load(std::memory_order_acquire);
-}
-
-PyObject* PyObjectSlot::_unchecked_untagged_pyobj() const {
-  // NOLINTNEXTLINE(performance-no-int-to-ptr)
-  return reinterpret_cast<PyObject*>(
-      reinterpret_cast<uintptr_t>(pyobj_) & ~0x1ULL);
-}
-
-PyInterpreter& PyObjectSlot::load_pyobj_interpreter() const {
-  auto interpreter = pyobj_interpreter_.load(std::memory_order_acquire);
-  if (interpreter) {
-    return *interpreter;
-  }
-  TORCH_CHECK(false, "cannot access PyObject for Tensor - no interpreter set");
-}
-
-bool PyObjectSlot::owns_pyobj() {
-  // NOLINTNEXTLINE(performance-no-int-to-ptr)
-  return reinterpret_cast<uintptr_t>(pyobj_) & 1;
-}
-
-void PyObjectSlot::set_owns_pyobj(bool b) {
-  // NOLINTNEXTLINE(performance-no-int-to-ptr)
-  pyobj_ = reinterpret_cast<PyObject*>(
-      reinterpret_cast<uintptr_t>(_unchecked_untagged_pyobj()) | b);
-}
-
-} // namespace c10::impl
--- a/c10/core/impl/PyObjectSlot.h
+++ b/c10/core/impl/PyObjectSlot.h
@ -8,117 +8,70 @@

 #include <atomic>

+namespace torch::utils {
+class PyObjectPreservation;
+}
+
 namespace c10::impl {

 struct C10_API PyObjectSlot {
 public:
-  PyObjectSlot();
-
-  ~PyObjectSlot();
-
-  void maybe_destroy_pyobj();
-
-  // Associate the TensorImpl with the specified PyObject, and, if necessary,
-  // also tag the interpreter.
-  //
-  // NB: This lives in a header so that we can inline away the switch on status
-  //
-  // NB: THIS FUNCTION CAN RAISE AN EXCEPTION.  Make sure to clean up after
-  // PyObject if necessary!
-  void init_pyobj(PyObject* pyobj) {
-    pyobj_interpreter_.store(
-        getGlobalPyInterpreter(), std::memory_order_relaxed);
-    pyobj_ = pyobj;
-  }
+  PyObjectSlot() : pyobj_interpreter_(nullptr), pyobj_(nullptr) {}

  // Query the PyObject interpreter.  This may return null if there is no
-  // interpreter.  This is racy!
-  PyInterpreter* pyobj_interpreter();
-
-  PyObject* _unchecked_untagged_pyobj() const;
-
-  // Test the interpreter tag.  If tagged for the current interpreter, return
-  // a non-nullopt (but possibly null) PyObject.  If (possibly) untagged,
-  // returns a nullopt.  If it is definitely invalid, raises an error.
-  //
-  // If `ignore_hermetic_tls` is false and this function is called from a
-  // hermetic context (ie, `HermeticPyObjectTLS::get_state()` is true), then
-  // nullopt is returned. If `ignore_hermetic_tls` is true, then the hermetic
-  // context is ignored, allowing you to check the interpreter tag of a
-  // nonhermetic PyObject from within a hermetic context. This is necessary
-  // because there are some cases where the deallocator function of a
-  // nonhermetic PyObject is called from within a hermetic context, so it must
-  // be properly treated as a nonhermetic PyObject.
-  //
-  // NB: this lives in header so that we can avoid actually creating the
-  // std::optional
-
-  // @todo alban: I'm not too sure what's going on here, we can probably delete
-  // it but it's worthwhile making sure
-  std::optional<PyObject*> check_pyobj(bool ignore_hermetic_tls = false) const {
-    impl::PyInterpreter* interpreter =
-        pyobj_interpreter_.load(std::memory_order_acquire);
-    if (interpreter == nullptr) {
-      return std::nullopt;
-    }
-
-    if (!ignore_hermetic_tls && c10::impl::HermeticPyObjectTLS::get_state()) {
-      return std::nullopt;
-    } else {
-      return _unchecked_untagged_pyobj();
-    }
+  // interpreter.
+  PyInterpreter* pyobj_interpreter() const {
+    return pyobj_interpreter_.load(std::memory_order_acquire);
  }

-  PyInterpreter& load_pyobj_interpreter() const;
+  PyInterpreter& load_pyobj_interpreter() const {
+    auto interpreter = pyobj_interpreter_.load(std::memory_order_acquire);
+    TORCH_INTERNAL_ASSERT(
+        interpreter, "cannot access PyObject for Tensor - no interpreter set");
+    return *interpreter;
+  }

-  bool owns_pyobj();
+  // Returns the stored PyObject pointer using an acquire load. The result
+  // is nullptr when no PyObject is recorded (the state after construction
+  // or after clear()).
+  PyObject* load_pyobj() const {
+    return pyobj_.load(std::memory_order_acquire);
+  }

-  void set_owns_pyobj(bool b);
+  // Returns true only when a PyObject is recorded and the interpreter
+  // reports its reference count as exactly 1. With no PyObject recorded the
+  // result is false and the interpreter is not consulted; otherwise
+  // load_pyobj_interpreter() asserts that an interpreter is set.
+  bool has_unique_reference() const {
+    PyObject* pyobj = load_pyobj();
+    return pyobj != nullptr && load_pyobj_interpreter()->refcnt(pyobj) == 1;
+  }
+
+  // Resets both the PyObject pointer and the interpreter pointer to nullptr
+  // using relaxed stores. This only forgets the pointers: no decref is
+  // issued on the PyObject here.
+  void clear() {
+    pyobj_.store(nullptr, std::memory_order_relaxed);
+    pyobj_interpreter_.store(nullptr, std::memory_order_relaxed);
+  }
+
+  // Non thread-safe swap
+  //
+  // Exchanges the interpreter pointer and the PyObject pointer with `other`.
+  // Each field is exchanged through separate relaxed loads and stores, so
+  // the swap as a whole is not atomic and a concurrent reader could observe
+  // a half-swapped slot.
+  void swap(PyObjectSlot& other) noexcept {
+    // Exchange the interpreter pointers.
+    PyInterpreter* tmp_interpreter =
+        pyobj_interpreter_.load(std::memory_order_relaxed);
+    pyobj_interpreter_.store(
+        other.pyobj_interpreter_.load(std::memory_order_relaxed),
+        std::memory_order_relaxed);
+    other.pyobj_interpreter_.store(tmp_interpreter, std::memory_order_relaxed);
+
+    // Exchange the PyObject pointers.
+    PyObject* tmp_pyobj = pyobj_.load(std::memory_order_relaxed);
+    pyobj_.store(
+        other.pyobj_.load(std::memory_order_relaxed),
+        std::memory_order_relaxed);
+    other.pyobj_.store(tmp_pyobj, std::memory_order_relaxed);
+  }

 private:
-  // This field contains the interpreter tag for this object.  See
-  // Note [Python interpreter tag] for general context
-  //
-  // Note [Memory ordering on Python interpreter tag]
-  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  // What memory_order do we need when accessing this atomic?  We don't
-  // need a single total modification order (as provided by
-  // memory_order_seq_cst) as pyobj_interpreter_ is monotonic: it can only
-  // transition from -1 to some positive integer and never changes afterwards.
-  // Because there is only one modification, it trivially already has a total
-  // modification order (e.g., we don't need fences or locked instructions on
-  // x86)
-  //
-  // In fact, one could make a reasonable argument that relaxed reads are OK,
-  // due to the presence of external locking (GIL) to ensure that interactions
-  // with other data structures are still correctly synchronized, so that
-  // we fall in the "Single-Location Data Structures" case as described in
-  // http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p2055r0.pdf
-  // However, on x86, it doesn't matter if I use acquire or relaxed on the load
-  // as I get the same assembly in both cases.  So I just use the more
-  // conservative acquire (which will impede compiler optimizations but I don't
-  // care)
+  // This is now always the global interpreter if the PyObject is set.
+  // Maybe we can remove this field some day...
  std::atomic<PyInterpreter*> pyobj_interpreter_;

-  // This field contains a reference to a PyObject representing this Tensor.
-  // If pyobj is nullptr, when we transfer Tensor to Python, we allocate a new
-  // PyObject for it and set this field.  This field does not have to be
-  // protected by an atomic as it is only allowed to be accessed when you hold
-  // the GIL, or during destruction of the tensor.
-  //
-  // When a PyObject dies, you are obligated to clear this field
-  // (otherwise, you will try to use-after-free the pyobj); this currently
-  // occurs in THPVariable_clear in torch/csrc/autograd/python_variable.cpp
-  //
-  // NB: Ordinarily, this should not be a strong reference, as if the
-  // PyObject owns the Tensor, this would create a reference cycle.
-  // However, sometimes this ownership flips.  To track who owns
-  // who, this has a single pointer tag indicating whether or not the
-  // C++ object owns the PyObject (the common case, zero, means PyObject
-  // owns the C++ object); see _unchecked_untagged_pyobj for raw access
-  // or check_pyobj for checked access.  See references to PyObject
-  // resurrection in torch/csrc/autograd/python_variable.cpp
-  PyObject* pyobj_;
+  // The PyObject representing this Tensor or nullptr. Ownership is managed
+  // by intrusive_ptr. By the time the PyObjectSlot is destroyed, this
+  // reference is already dead.
+  std::atomic<PyObject*> pyobj_;
+
+  friend class torch::utils::PyObjectPreservation;
 };

 } // namespace c10::impl
--- a/c10/cuda/CUDACachingAllocator.h
+++ b/c10/cuda/CUDACachingAllocator.h
@ -345,6 +345,13 @@ class CUDAAllocator : public DeviceAllocator {
      c10::DeviceIndex device,
      std::shared_ptr<AllocatorState> pps) = 0;
  virtual std::string name() = 0;
+  // Returns {free, total} device memory as reported by cudaMemGetInfo for
+  // `device`. A DeviceGuard makes `device` the current CUDA device for the
+  // query; a failing CUDA call is reported through C10_CUDA_CHECK.
+  std::pair<size_t, size_t> getMemoryInfo(c10::DeviceIndex device) override {
+    c10::DeviceGuard device_guard({at::kCUDA, device});
+    size_t free = 0;
+    size_t total = 0;
+    C10_CUDA_CHECK(cudaMemGetInfo(&free, &total));
+    return {free, total};
+  }
 };

 // Allocator object, statically initialized
--- a/c10/test/build.bzl
+++ b/c10/test/build.bzl
@ -66,6 +66,15 @@ def define_targets(rules):
        ],
    )

+    rules.cc_test(
+        name = "util/nofatal_test",
+        srcs = ["util/nofatal_test.cpp"],
+        deps = [
+            "//c10/util:base",
+            "@com_google_googletest//:gtest_main",
+        ],
+    )
+
    rules.cc_test(
        name = "util/ssize_test",
        srcs = ["util/ssize_test.cpp"],
--- a/c10/test/util/nofatal_test.cpp
+++ b/c10/test/util/nofatal_test.cpp
@ -0,0 +1,53 @@
+#include <gtest/gtest.h>
+
+#include <c10/util/Exception.h>
+#include <c10/util/Logging.h>
+
+namespace {
+// Invokes `fn` and expects it to throw a c10::Error whose
+// what_without_backtrace() text contains `expected_msg`. Despite the "Eq"
+// in the name this is a substring match, not an equality check.
+// If `fn` returns normally, a non-fatal gtest failure is recorded.
+// Exceptions of any other type are not caught here and propagate.
+template <typename T>
+inline void expectThrowsEq(T&& fn, const char* expected_msg) {
+  try {
+    std::forward<T>(fn)();
+  } catch (const c10::Error& e) {
+    EXPECT_TRUE(
+        std::string(e.what_without_backtrace()).find(expected_msg) !=
+        std::string::npos);
+    return;
+  }
+  // Only reached when fn() did not throw a c10::Error.
+  ADD_FAILURE() << "Expected to throw exception with message \"" << expected_msg
+                << "\" but didn't throw";
+}
+} // namespace
+
+// Exercises the throwing TORCH_CHECK_* comparison macros and
+// TORCH_CHECK_NOTNULL, then the debug-only TORCH_DCHECK_EQ.
+TEST(NofatalTest, TorchCheckComparisons) {
+  // Quick sanity check: a passing comparison must not throw, even with a
+  // message streamed after it.
+  TORCH_CHECK_EQ(1, 1) << "i am a silly message " << 1;
+  // A failing comparison throws, and the streamed text is appended to the
+  // generated "Check failed" prefix.
+  expectThrowsEq(
+      []() { TORCH_CHECK_EQ(1, 2) << "i am a silly message " << 1; },
+      "Check failed: 1 == 2 (1 vs. 2). i am a silly message 1");
+  expectThrowsEq(
+      []() { TORCH_CHECK_NE(2, 2); }, "Check failed: 2 != 2 (2 vs. 2).");
+  expectThrowsEq(
+      []() { TORCH_CHECK_LT(2, 2); }, "Check failed: 2 < 2 (2 vs. 2).");
+  expectThrowsEq(
+      []() { TORCH_CHECK_LE(3, 2); }, "Check failed: 3 <= 2 (3 vs. 2).");
+  expectThrowsEq(
+      []() { TORCH_CHECK_GT(2, 2); }, "Check failed: 2 > 2 (2 vs. 2).");
+  expectThrowsEq(
+      []() { TORCH_CHECK_GE(2, 3); }, "Check failed: 2 >= 3 (2 vs. 3).");
+  expectThrowsEq(
+      []() {
+        void* p = nullptr;
+        TORCH_CHECK_NOTNULL(p);
+      },
+      "Check failed: 'p' must be non NULL.");
+
+#if GTEST_HAS_DEATH_TEST
+#ifndef NDEBUG
+  // In a debug build, a failing DCHECK should result in process death.
+  EXPECT_DEATH(TORCH_DCHECK_EQ(1, 2), "Check failed");
+#else
+  TORCH_DCHECK_EQ(1, 2); // no-op
+#endif
+#endif // GTEST_HAS_DEATH_TEST
+}
--- a/c10/util/Exception.h
+++ b/c10/util/Exception.h
@ -702,6 +702,98 @@ namespace c10::detail {
 #define TORCH_CHECK_ARG(cond, argN, ...) \
  TORCH_CHECK(cond, "invalid argument ", argN, ": ", __VA_ARGS__)

+// FATAL_IF(condition) / NON_FATAL_IF(condition)
+//
+// Both expand to a conditional expression: when `condition` is true the
+// result is `(void)0`; otherwise a temporary ::c10::MessageLogger with FATAL
+// severity is created and any `<<` operands that follow the macro are
+// streamed into it.
+//
+//   - FATAL_IF uses the three-argument MessageLogger constructor, i.e. it
+//     relies on the default for `exit_on_fatal`, which is declared outside
+//     this file (presumably true - confirm in Logging.h).
+//   - NON_FATAL_IF passes `exit_on_fatal = false`, the mode in which
+//     MessageLogger::DealWithFatal throws c10::Error instead of terminating.
+//
+// `condition` is parenthesized in the expansion so that an argument whose
+// top-level operator binds more loosely than `?:` (assignment, comma, a
+// nested conditional) is still evaluated as a single condition.
+#ifndef FATAL_IF
+#ifdef C10_USE_GLOG
+#define FATAL_IF(condition)                                              \
+  (condition) ? (void)0                                                  \
+              : ::c10::LoggerVoidify() &                                 \
+          ::c10::MessageLogger(__FILE__, __LINE__, ::google::GLOG_FATAL) \
+              .stream()
+#else
+#define FATAL_IF(condition)              \
+  (condition) ? (void)0                  \
+              : ::c10::LoggerVoidify() & \
+          ::c10::MessageLogger(__FILE__, __LINE__, ::c10::GLOG_FATAL).stream()
+#endif
+#endif
+
+#ifndef NON_FATAL_IF
+#ifdef C10_USE_GLOG
+#define NON_FATAL_IF(condition)                                \
+  (condition) ? (void)0                                        \
+              : ::c10::LoggerVoidify() &                       \
+          ::c10::MessageLogger(                                \
+              __FILE__, __LINE__, ::google::GLOG_FATAL, false) \
+              .stream()
+#else
+#define NON_FATAL_IF(condition)                                              \
+  (condition) ? (void)0                                                      \
+              : ::c10::LoggerVoidify() &                                     \
+          ::c10::MessageLogger(__FILE__, __LINE__, ::c10::GLOG_FATAL, false) \
+              .stream()
+#endif
+#endif
+
+// Binary comparison check macros
+//
+// TORCH_CHECK_OP(val1, val2, op) evaluates `(val1) op (val2)`; when that is
+// false it goes through NON_FATAL_IF and starts a message of the form
+// "Check failed: <val1> <op> <val2> (<v1> vs. <v2>). ", to which callers may
+// append more text with `<<`. TORCH_DCHECK_OP builds the same message but
+// goes through FATAL_IF.
+//
+// Note that val1 and val2 appear both in the condition and in the message,
+// so on failure each argument expression is evaluated a second time.
+#define TORCH_CHECK_OP(val1, val2, op)                                      \
+  NON_FATAL_IF(((val1)op(val2)))                                            \
+      << "Check failed: " #val1 " " #op " " #val2 " (" << (val1) << " vs. " \
+      << (val2) << "). "
+
+#define TORCH_DCHECK_OP(val1, val2, op)                                       \
+  FATAL_IF(((val1)op(val2))) << "Check failed: " #val1 " " #op " " #val2 " (" \
+                             << (val1) << " vs. " << (val2) << "). "
+
+#define TORCH_CHECK_EQ(val1, val2) TORCH_CHECK_OP(val1, val2, ==)
+#define TORCH_CHECK_NE(val1, val2) TORCH_CHECK_OP(val1, val2, !=)
+#define TORCH_CHECK_LE(val1, val2) TORCH_CHECK_OP(val1, val2, <=)
+#define TORCH_CHECK_LT(val1, val2) TORCH_CHECK_OP(val1, val2, <)
+#define TORCH_CHECK_GE(val1, val2) TORCH_CHECK_OP(val1, val2, >=)
+#define TORCH_CHECK_GT(val1, val2) TORCH_CHECK_OP(val1, val2, >)
+
+// Debug versions of TORCH_CHECK_OP macros
+#ifndef NDEBUG
+#define TORCH_DCHECK_EQ(val1, val2) TORCH_DCHECK_OP(val1, val2, ==)
+#define TORCH_DCHECK_NE(val1, val2) TORCH_DCHECK_OP(val1, val2, !=)
+#define TORCH_DCHECK_LE(val1, val2) TORCH_DCHECK_OP(val1, val2, <=)
+#define TORCH_DCHECK_LT(val1, val2) TORCH_DCHECK_OP(val1, val2, <)
+#define TORCH_DCHECK_GE(val1, val2) TORCH_DCHECK_OP(val1, val2, >=)
+#define TORCH_DCHECK_GT(val1, val2) TORCH_DCHECK_OP(val1, val2, >)
+#else // !NDEBUG
+// Optimized versions - generate no code
+#define TORCH_DCHECK_EQ(val1, val2) \
+  while (false)                     \
+  TORCH_DCHECK_OP(val1, val2, ==)
+#define TORCH_DCHECK_NE(val1, val2) \
+  while (false)                     \
+  TORCH_DCHECK_OP(val1, val2, !=)
+#define TORCH_DCHECK_LE(val1, val2) \
+  while (false)                     \
+  TORCH_DCHECK_OP(val1, val2, <=)
+#define TORCH_DCHECK_LT(val1, val2) \
+  while (false)                     \
+  TORCH_DCHECK_OP(val1, val2, <)
+#define TORCH_DCHECK_GE(val1, val2) \
+  while (false)                     \
+  TORCH_DCHECK_OP(val1, val2, >=)
+#define TORCH_DCHECK_GT(val1, val2) \
+  while (false)                     \
+  TORCH_DCHECK_OP(val1, val2, >)
+#endif // NDEBUG
+
+// Null pointer check macro
+#define TORCH_CHECK_NOTNULL(val) \
+  ::c10::CheckNotNull(__FILE__, __LINE__, #val, (val), false)
+
+#ifndef NDEBUG
+#define TORCH_DCHECK_NOTNULL(val) \
+  ::c10::CheckNotNull(__FILE__, __LINE__, #val, (val), true)
+#else // !NDEBUG
+#define TORCH_DCHECK_NOTNULL(val) \
+  while (false)                   \
+  TORCH_CHECK_NOTNULL(val)
+#endif // NDEBUG
+
 // ----------------------------------------------------------------------------
 // Deprecated macros
 // ----------------------------------------------------------------------------
--- a/c10/util/Logging.cpp
+++ b/c10/util/Logging.cpp
@ -291,6 +291,32 @@ namespace c10 {
 using fLB::FLAGS_logtostderr;
 using fLI::FLAGS_minloglevel;
 using fLI::FLAGS_v;
+
+// glog-build constructor: records the severity and whether a fatal message
+// should terminate the process (exit_on_fatal) or be raised as an
+// exception. `file` and `line` are accepted but not used by this variant.
+MessageLogger::MessageLogger(
+    const char* file,
+    int line,
+    int severity,
+    bool exit_on_fatal)
+    : stream_(), severity_(severity), exit_on_fatal_(exit_on_fatal) {}
+
+// glog-build destructor: acts only on FATAL severity, handing the
+// accumulated message to DealWithFatal(). Declared noexcept(false) because
+// DealWithFatal() throws c10::Error when exit_on_fatal_ is false.
+MessageLogger::~MessageLogger() noexcept(false) {
+  if (severity_ == ::google::GLOG_FATAL) {
+    DealWithFatal();
+  }
+}
+
+std::stringstream& MessageLogger::stream() {
+  return stream_;
+}
+
+// glog-build fatal handling: with exit_on_fatal_ set, the accumulated
+// message is emitted through glog's LOG(FATAL); otherwise it is thrown as a
+// c10::Error so the caller can catch it.
+void MessageLogger::DealWithFatal() {
+  if (exit_on_fatal_) {
+    LOG(FATAL) << stream_.str();
+  } else {
+    throw c10::Error(stream_.str(), nullptr, nullptr);
+  }
+}
+
 } // namespace c10

 C10_DEFINE_int(
@ -412,17 +438,16 @@ void ShowLogInfoToStderr() {
  FLAGS_caffe2_log_level = GLOG_INFO;
 }

-MessageLogger::MessageLogger(const char* file, int line, int severity)
-    : severity_(severity) {
+MessageLogger::MessageLogger(
+    const char* file,
+    int line,
+    int severity,
+    bool exit_on_fatal)
+    : severity_(severity), exit_on_fatal_(exit_on_fatal) {
  if (severity_ < FLAGS_caffe2_log_level) {
    // Nothing needs to be logged.
    return;
  }
-#ifdef ANDROID
-  tag_ = "native";
-#else // !ANDROID
-  tag_ = "";
-#endif // ANDROID

  time_t rawtime = 0;
  time(&rawtime);
@ -458,7 +483,7 @@ MessageLogger::MessageLogger(const char* file, int line, int severity)
 }

 // Output the contents of the stream to the proper channel on destruction.
-MessageLogger::~MessageLogger() {
+MessageLogger::~MessageLogger() noexcept(false) {
  if (severity_ < FLAGS_caffe2_log_level) {
    // Nothing needs to be logged.
    return;
@ -498,6 +523,18 @@ MessageLogger::~MessageLogger() {
  }
 }

+std::stringstream& MessageLogger::stream() {
+  return stream_;
+}
+
+// Non-glog fatal handling: with exit_on_fatal_ set, the process is
+// terminated via abort(); otherwise the accumulated message is thrown as a
+// c10::Error so the caller can catch it.
+void MessageLogger::DealWithFatal() {
+  if (exit_on_fatal_) {
+    abort();
+  } else {
+    throw c10::Error(stream_.str(), nullptr, nullptr);
+  }
+}
+
 } // namespace c10

 #endif // !C10_USE_GLOG
--- a/c10/util/intrusive_ptr.h
+++ b/c10/util/intrusive_ptr.h
@ -12,6 +12,10 @@ template <typename, typename...>
 class class_;
 }

+namespace torch::utils {
+class PyObjectPreservation;
+}
+
 namespace c10 {
 class intrusive_ptr_target;
 namespace raw {
@ -33,6 +37,8 @@ constexpr uint64_t kImpracticallyHugeWeakReferenceCount =
 constexpr uint64_t kReferenceCountOne = 1;
 constexpr uint64_t kWeakReferenceCountOne = (kReferenceCountOne << 32);
 constexpr uint64_t kUniqueRef = (kReferenceCountOne | kWeakReferenceCountOne);
+// Indicates whether the object has a PyObject wrapper.
+constexpr uint64_t kHasPyObject = (uint64_t(1) << 63);

 template <class TTarget>
 struct intrusive_target_default_null_type final {
@ -55,7 +61,11 @@ inline uint32_t refcount(uint64_t combined_refcount) {
 }

 inline uint32_t weakcount(uint64_t combined_refcount) {
-  return static_cast<uint32_t>(combined_refcount >> 32);
+  return static_cast<uint32_t>((combined_refcount & ~kHasPyObject) >> 32);
+}
+
+// Returns true when the kHasPyObject flag (the most significant bit) is set
+// in a combined refcount word.
+inline bool has_pyobject(uint64_t combined_refcount) {
+  return (combined_refcount & kHasPyObject) != 0;
+}

 // The only requirement for refcount increment is that it happens-before
@ -66,12 +76,6 @@ inline uint64_t atomic_combined_refcount_increment(
  return combined_refcount.fetch_add(inc, std::memory_order_relaxed) + inc;
 }

-inline uint32_t atomic_refcount_increment(
-    std::atomic<uint64_t>& combined_refcount) {
-  return detail::refcount(atomic_combined_refcount_increment(
-      combined_refcount, kReferenceCountOne));
-}
-
 inline uint32_t atomic_weakcount_increment(
    std::atomic<uint64_t>& combined_refcount) {
  return detail::weakcount(atomic_combined_refcount_increment(
@ -99,6 +103,11 @@ inline uint32_t atomic_weakcount_decrement(
      combined_refcount, kWeakReferenceCountOne));
 }

+// Compile-time traits for intrusive_ptr targets. The primary template says
+// a target type cannot have a PyObject wrapper; specializations set
+// can_have_pyobject to true for the types that can. The second template
+// parameter exists so specializations can be constrained with
+// std::enable_if_t.
+template <class T, class = void>
+struct TargetTraits {
+  static constexpr bool can_have_pyobject = false;
+};
+
 } // namespace detail

 /**
@ -155,6 +164,23 @@ class C10_API intrusive_ptr_target {
  // we can atomically operate on both at the same time for performance
  // and defined behaviors.
  //
+  // Note [PyObject preservation for Tensor and Storages]
+  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+  // intrusive_ptr has special support for preserving PyObject wrappers
+  // for TensorImpl and StorageImpl. The most significant bit (kHasPyObject) of
+  // the combined_refcount_ is used to indicate whether the object has a
+  // PyObject wrapper.
+  //
+  //   - The PyObject, if it exists, holds a strong reference to the
+  //     intrusive_ptr_target.
+  //
+  //   - When the refcount goes from 1 to 2, we incref the PyObject.
+  //
+  //   - When the refcount goes from 2 to 1, we decref the PyObject.
+  //
+  // In other words, the intrusive_ptr keeps the PyObject alive as long as there
+  // are other C++ references to the intrusive_ptr_target.
+
  mutable std::atomic<uint64_t> combined_refcount_;
  static_assert(sizeof(std::atomic<uint64_t>) == 8);
  static_assert(alignof(std::atomic<uint64_t>) == 8);
@ -172,6 +198,8 @@ class C10_API intrusive_ptr_target {
  template <typename T>
  friend struct ExclusivelyOwnedTensorTraits;

+  friend class torch::utils::PyObjectPreservation;
+
 protected:
  // protected destructor. We never want to destruct intrusive_ptr_target*
  // directly.
@ -255,6 +283,16 @@ class C10_API intrusive_ptr_target {
   */
  virtual void release_resources() {}

+  /**
+   * Hooks for targets that may have a PyObject wrapper.
+   *
+   * incref_pyobject() and decref_pyobject() are called when the refcount
+   * transitions between one and two and the object has a PyObject wrapper.
+   * try_incref_pyobject() reports whether a reference to the wrapper could
+   * be taken. The base implementations do nothing and return false;
+   * TensorImpl and StorageImpl override all three.
+   */
+  virtual void incref_pyobject() const {}
+  virtual void decref_pyobject() const {}
+  virtual bool try_incref_pyobject() const {
+    return false;
+  }
+
  uint32_t refcount(std::memory_order order = std::memory_order_relaxed) const {
    return detail::refcount(combined_refcount_.load(order));
  }
@ -265,6 +303,15 @@ class C10_API intrusive_ptr_target {
  }
 };

+namespace detail {
+template <>
+struct TargetTraits<c10::intrusive_ptr_target> {
+  // A generic intrusive_ptr<intrusive_ptr_target> may actually be a TensorImpl
+  // or StorageImpl, so we have to allow for PyObject support.
+  static constexpr bool can_have_pyobject = true;
+};
+} // namespace detail
+
 template <class TTarget, class NullType>
 class weak_intrusive_ptr;

@ -314,18 +361,34 @@ class intrusive_ptr final {

  void retain_() {
    if (target_ != NullType::singleton()) {
-      uint32_t new_refcount =
-          detail::atomic_refcount_increment(target_->combined_refcount_);
+      uint64_t combined = detail::atomic_combined_refcount_increment(
+          target_->combined_refcount_, detail::kReferenceCountOne);
+      uint32_t new_refcount = detail::refcount(combined);
      TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
          new_refcount != 1,
          "intrusive_ptr: Cannot increase refcount after it reached zero.");
+
+      if constexpr (detail::TargetTraits<TTarget>::can_have_pyobject) {
+        // If the refcount transitioned from 1 to 2, we need to incref the
+        // PyObject. In other words, we need to ensure that the PyObject stays
+        // alive now that we have a C++ reference to this object in addition to
+        // the PyObject itself.
+        if (C10_UNLIKELY(
+                detail::has_pyobject(combined) &&
+                detail::refcount(combined) == 2)) {
+          target_->incref_pyobject();
+        }
+      } else {
+        TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
+            !detail::has_pyobject(combined),
+            "TargetTraits indicates that type cannot have PyObject, but refcount has PyObject bit set.");
+      }
    }
  }

  void reset_() noexcept {
    if (target_ != NullType::singleton()) {
-      if (target_->combined_refcount_.load(std::memory_order_acquire) ==
-          detail::kUniqueRef) {
+      if (is_uniquely_owned()) {
        // Both counts are 1, so there are no weak references and
        // we are releasing the last strong reference. No other
        // threads can observe the effects of this target_ deletion
@ -337,9 +400,10 @@ class intrusive_ptr final {

      auto combined_refcount = detail::atomic_combined_refcount_decrement(
          target_->combined_refcount_, detail::kReferenceCountOne);
-      if (detail::refcount(combined_refcount) == 0) {
-        bool should_delete =
-            (combined_refcount == detail::kWeakReferenceCountOne);
+      uint32_t new_refcount = detail::refcount(combined_refcount);
+      bool has_pyobject = detail::has_pyobject(combined_refcount);
+      if (new_refcount == 0) {
+        bool should_delete = detail::weakcount(combined_refcount) == 1;
        // See comment above about weakcount. As long as refcount>0,
        // weakcount is one larger than the actual number of weak references.
        // So we need to decrement it here.
@ -356,6 +420,18 @@ class intrusive_ptr final {
        if (should_delete) {
          delete target_;
        }
+      } else if constexpr (detail::TargetTraits<TTarget>::can_have_pyobject) {
+        // If the refcount transitioned from 2 to 1, we need to decref the
+        // PyObject. In other words, we don't want to keep the PyObject alive if
+        // there are no C++ references to this object other than the PyObject
+        // itself.
+        if (C10_UNLIKELY(has_pyobject && new_refcount == 1)) {
+          target_->decref_pyobject();
+        }
+      } else {
+        TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
+            !has_pyobject,
+            "TargetTraits indicates that type cannot have PyObject, but refcount has PyObject bit set.");
      }
    }
  }
@ -522,6 +598,16 @@ class intrusive_ptr final {
    return use_count() == 1;
  }

+  /**
+   * Stronger than unique() in that it must not have any weakrefs as well.
+   *
+   * The kHasPyObject flag bit is masked out of the combined refcount before
+   * it is compared against kUniqueRef, so the presence of a PyObject wrapper
+   * does not affect the result.
+   *
+   * Must not be called on a null intrusive_ptr (checked by a debug-only
+   * assert).
+   */
+  bool is_uniquely_owned() const noexcept {
+    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(target_ != NullType::singleton());
+    uint64_t combined =
+        target_->combined_refcount_.load(std::memory_order_acquire);
+    return (combined & ~detail::kHasPyObject) == detail::kUniqueRef;
+  }
+
  /**
   * Returns an owning (!) pointer to the underlying object and makes the
   * intrusive_ptr instance invalid. That means the refcount is not decreased.
@ -932,6 +1018,7 @@ class weak_intrusive_ptr final {
    if (target_ == NullType::singleton()) {
      return intrusive_ptr<TTarget, NullType>();
    } else {
+      bool increfed = false;
      auto combined_refcount =
          target_->combined_refcount_.load(std::memory_order_relaxed);
      do {
@ -940,12 +1027,31 @@ class weak_intrusive_ptr final {
          // Return nullptr.
          return intrusive_ptr<TTarget, NullType>();
        }
+        if constexpr (detail::TargetTraits<TTarget>::can_have_pyobject) {
+          if (detail::has_pyobject(combined_refcount) &&
+              detail::refcount(combined_refcount) == 1 && !increfed) {
+            // Object has a python wrapper with no other C++ references.
+            // We need to incref the Python object before we acquire a
+            // strong reference to the C++ object to avoid a situation
+            // where the Python object is deallocated concurrently.
+            if (!target_->try_incref_pyobject()) {
+              return intrusive_ptr<TTarget, NullType>();
+            }
+            increfed = true;
+          }
+        }
      } while (!target_->combined_refcount_.compare_exchange_weak(
          combined_refcount,
          combined_refcount + detail::kReferenceCountOne,
          std::memory_order_acquire,
          std::memory_order_relaxed));

+      if constexpr (detail::TargetTraits<TTarget>::can_have_pyobject) {
+        if (increfed && detail::refcount(combined_refcount) != 1) {
+          target_->decref_pyobject();
+        }
+      }
+
      return intrusive_ptr<TTarget, NullType>(
          target_, raw::DontIncreaseRefcount{});
    }
@ -1060,7 +1166,14 @@ namespace intrusive_ptr {
 // NullType::singleton to this function
 inline void incref(intrusive_ptr_target* self) {
  if (self) {
-    detail::atomic_refcount_increment(self->combined_refcount_);
+    uint64_t combined = detail::atomic_combined_refcount_increment(
+        self->combined_refcount_, detail::kReferenceCountOne);
+
+    if (C10_UNLIKELY(
+            detail::has_pyobject(combined) &&
+            detail::refcount(combined) == 2)) {
+      self->incref_pyobject();
+    }
  }
 }

--- a/c10/util/logging_common.h
+++ b/c10/util/logging_common.h
@ -0,0 +1,74 @@
+#ifndef C10_UTIL_LOGGING_COMMON_H_
+#define C10_UTIL_LOGGING_COMMON_H_
+
+#include <c10/macros/Export.h>
+#include <sstream>
+
+namespace c10 {
+
+// MessageLogger that throws exceptions instead of aborting (glog version)
+// or logs and may abort (non-glog version).
+class C10_API MessageLogger {
+ public:
+  MessageLogger(
+      const char* file,
+      int line,
+      int severity,
+      bool exit_on_fatal = true);
+  ~MessageLogger() noexcept(false);
+
+  // Return the stream associated with the logger object.
+  std::stringstream& stream();
+
+ private:
+  // When there is a fatal log and exit_on_fatal_ is true, we abort;
+  // otherwise, we throw.
+  void DealWithFatal();
+
+#if defined(ANDROID) && !defined(C10_USE_GLOG)
+  const char* tag_{"native"};
+#endif
+  std::stringstream stream_;
+  int severity_;
+  bool exit_on_fatal_;
+};
+
+// This class is used to explicitly ignore values in the conditional
+// logging macros. This avoids compiler warnings like "value computed
+// is not used" and "statement has no effect".
+class C10_API LoggerVoidify {
+ public:
+  LoggerVoidify() = default;
+  // This has to be an operator with a precedence lower than << but
+  // higher than ?:
+  void operator&(const std::ostream& s [[maybe_unused]]) {}
+};
+
+// Forward declarations for CheckNotNull functions
+template <typename T>
+T& CheckNotNullCommon(
+    const char* file,
+    int line,
+    const char* names,
+    T& t,
+    bool fatal = true);
+
+template <typename T>
+T* CheckNotNull(
+    const char* file,
+    int line,
+    const char* names,
+    T* t,
+    bool fatal = true);
+
+template <typename T>
+T& CheckNotNull(
+    const char* file,
+    int line,
+    const char* names,
+    T& t,
+    bool fatal = true);
+
+} // namespace c10
+
+#endif // C10_UTIL_LOGGING_COMMON_H_
--- a/c10/util/logging_is_google_glog.h
+++ b/c10/util/logging_is_google_glog.h
@ -47,57 +47,53 @@ INSTANTIATE_FOR_CONTAINER(set)

 #endif

+#include <c10/util/logging_common.h>
 #include <glog/logging.h>

-// Additional macros on top of glog
-#define TORCH_CHECK_EQ(val1, val2) CHECK_EQ(val1, val2)
-#define TORCH_CHECK_NE(val1, val2) CHECK_NE(val1, val2)
-#define TORCH_CHECK_LE(val1, val2) CHECK_LE(val1, val2)
-#define TORCH_CHECK_LT(val1, val2) CHECK_LT(val1, val2)
-#define TORCH_CHECK_GE(val1, val2) CHECK_GE(val1, val2)
-#define TORCH_CHECK_GT(val1, val2) CHECK_GT(val1, val2)
+namespace c10 {

-#ifndef NDEBUG
-#define TORCH_DCHECK_EQ(val1, val2) DCHECK_EQ(val1, val2)
-#define TORCH_DCHECK_NE(val1, val2) DCHECK_NE(val1, val2)
-#define TORCH_DCHECK_LE(val1, val2) DCHECK_LE(val1, val2)
-#define TORCH_DCHECK_LT(val1, val2) DCHECK_LT(val1, val2)
-#define TORCH_DCHECK_GE(val1, val2) DCHECK_GE(val1, val2)
-#define TORCH_DCHECK_GT(val1, val2) DCHECK_GT(val1, val2)
-#else // !NDEBUG
-// These versions generate no code in optimized mode.
-#define TORCH_DCHECK_EQ(val1, val2) \
-  while (false)                     \
-  DCHECK_EQ(val1, val2)
-#define TORCH_DCHECK_NE(val1, val2) \
-  while (false)                     \
-  DCHECK_NE(val1, val2)
-#define TORCH_DCHECK_LE(val1, val2) \
-  while (false)                     \
-  DCHECK_LE(val1, val2)
-#define TORCH_DCHECK_LT(val1, val2) \
-  while (false)                     \
-  DCHECK_LT(val1, val2)
-#define TORCH_DCHECK_GE(val1, val2) \
-  while (false)                     \
-  DCHECK_GE(val1, val2)
-#define TORCH_DCHECK_GT(val1, val2) \
-  while (false)                     \
-  DCHECK_GT(val1, val2)
-#endif // NDEBUG
+[[noreturn]] void ThrowEnforceNotMet(
+    const char* file,
+    const int line,
+    const char* condition,
+    const std::string& msg,
+    const void* caller);

-// Check that a pointer is not null.
-#define TORCH_CHECK_NOTNULL(val) CHECK_NOTNULL(val)
+template <typename T>
+T& CheckNotNullCommon(
+    const char* file,
+    int line,
+    const char* names,
+    T& t,
+    bool fatal) {
+  if (t == nullptr) {
+    MessageLogger(file, line, ::google::GLOG_FATAL, fatal).stream()
+        << "Check failed: '" << names << "' must be non NULL. ";
+  }
+  return t;
+}

-#ifndef NDEBUG
-// Debug only version of TORCH_CHECK_NOTNULL
-#define TORCH_DCHECK_NOTNULL(val) DCHECK_NOTNULL(val)
-#else // !NDEBUG
-// Optimized version - generates no code.
-#define TORCH_DCHECK_NOTNULL(val) \
-  while (false)                   \
-  DCHECK_NOTNULL(val)
-#endif // NDEBUG
+template <typename T>
+T* CheckNotNull(
+    const char* file,
+    int line,
+    const char* names,
+    T* t,
+    bool fatal) {
+  return CheckNotNullCommon(file, line, names, t, fatal);
+}
+
+template <typename T>
+T& CheckNotNull(
+    const char* file,
+    int line,
+    const char* names,
+    T& t,
+    bool fatal) {
+  return CheckNotNullCommon(file, line, names, t, fatal);
+}
+
+} // namespace c10

 // Log with source location information override (to be used in generic
 // warning/error handlers implemented as functions, not macros)
--- a/c10/util/logging_is_not_google_glog.h
+++ b/c10/util/logging_is_not_google_glog.h
@ -13,6 +13,7 @@
 #include <vector>

 #include <c10/util/Flags.h>
+#include <c10/util/logging_common.h>

 const char CAFFE2_SEVERITY_PREFIX[] = "FEWIV";

@ -24,61 +25,40 @@ const int GLOG_ERROR = 2;
 const int GLOG_WARNING = 1;
 const int GLOG_INFO = 0;

-class C10_API MessageLogger {
- public:
-  MessageLogger(const char* file, int line, int severity);
-  ~MessageLogger();
-  // Return the stream associated with the logger object.
-  std::stringstream& stream() {
-    return stream_;
-  }
-
- private:
-  // When there is a fatal log, we simply abort.
-  void DealWithFatal() {
-    abort();
-  }
-
-  const char* tag_;
-  std::stringstream stream_;
-  int severity_;
-};
-
-// This class is used to explicitly ignore values in the conditional
-// logging macros.  This avoids compiler warnings like "value computed
-// is not used" and "statement has no effect".
-class C10_API LoggerVoidify {
- public:
-  LoggerVoidify() = default;
-  // This has to be an operator with a precedence lower than << but
-  // higher than ?:
-  void operator&(const std::ostream& s [[maybe_unused]]) {}
-};
-
-// Log a message and terminate.
-template <class T>
-void LogMessageFatal(const char* file, int line, const T& message) {
-  MessageLogger(file, line, GLOG_FATAL).stream() << message;
-}
-
 // Helpers for TORCH_CHECK_NOTNULL(). Two are necessary to support both raw
 // pointers and smart pointers.
 template <typename T>
-T& CheckNotNullCommon(const char* file, int line, const char* names, T& t) {
+T& CheckNotNullCommon(
+    const char* file,
+    int line,
+    const char* names,
+    T& t,
+    bool fatal) {
  if (t == nullptr) {
-    LogMessageFatal(file, line, std::string(names));
+    MessageLogger(file, line, GLOG_FATAL, fatal).stream()
+        << "Check failed: '" << names << "' must be non NULL. ";
  }
  return t;
 }

 template <typename T>
-T* CheckNotNull(const char* file, int line, const char* names, T* t) {
-  return CheckNotNullCommon(file, line, names, t);
+T* CheckNotNull(
+    const char* file,
+    int line,
+    const char* names,
+    T* t,
+    bool fatal) {
+  return CheckNotNullCommon(file, line, names, t, fatal);
 }

 template <typename T>
-T& CheckNotNull(const char* file, int line, const char* names, T& t) {
-  return CheckNotNullCommon(file, line, names, t);
+T& CheckNotNull(
+    const char* file,
+    int line,
+    const char* names,
+    T& t,
+    bool fatal) {
+  return CheckNotNullCommon(file, line, names, t, fatal);
 }
 } // namespace c10

@ -136,65 +116,6 @@ static_assert(
          ::c10::MessageLogger(__FILE__, __LINE__, ::c10::GLOG_##n).stream()
 #endif // NDEBUG

-#define TORCH_CHECK_OP(val1, val2, op)                                        \
-  FATAL_IF(((val1)op(val2))) << "Check failed: " #val1 " " #op " " #val2 " (" \
-                             << (val1) << " vs. " << (val2) << ") "
-
-// TORCH_CHECK_OP macro definitions
-#define TORCH_CHECK_EQ(val1, val2) TORCH_CHECK_OP(val1, val2, ==)
-#define TORCH_CHECK_NE(val1, val2) TORCH_CHECK_OP(val1, val2, !=)
-#define TORCH_CHECK_LE(val1, val2) TORCH_CHECK_OP(val1, val2, <=)
-#define TORCH_CHECK_LT(val1, val2) TORCH_CHECK_OP(val1, val2, <)
-#define TORCH_CHECK_GE(val1, val2) TORCH_CHECK_OP(val1, val2, >=)
-#define TORCH_CHECK_GT(val1, val2) TORCH_CHECK_OP(val1, val2, >)
-
-#ifndef NDEBUG
-// Debug only versions of TORCH_CHECK_OP macros.
-#define TORCH_DCHECK_EQ(val1, val2) TORCH_CHECK_OP(val1, val2, ==)
-#define TORCH_DCHECK_NE(val1, val2) TORCH_CHECK_OP(val1, val2, !=)
-#define TORCH_DCHECK_LE(val1, val2) TORCH_CHECK_OP(val1, val2, <=)
-#define TORCH_DCHECK_LT(val1, val2) TORCH_CHECK_OP(val1, val2, <)
-#define TORCH_DCHECK_GE(val1, val2) TORCH_CHECK_OP(val1, val2, >=)
-#define TORCH_DCHECK_GT(val1, val2) TORCH_CHECK_OP(val1, val2, >)
-#else // !NDEBUG
-// These versions generate no code in optimized mode.
-#define TORCH_DCHECK_EQ(val1, val2) \
-  while (false)                     \
-  TORCH_CHECK_OP(val1, val2, ==)
-#define TORCH_DCHECK_NE(val1, val2) \
-  while (false)                     \
-  TORCH_CHECK_OP(val1, val2, !=)
-#define TORCH_DCHECK_LE(val1, val2) \
-  while (false)                     \
-  TORCH_CHECK_OP(val1, val2, <=)
-#define TORCH_DCHECK_LT(val1, val2) \
-  while (false)                     \
-  TORCH_CHECK_OP(val1, val2, <)
-#define TORCH_DCHECK_GE(val1, val2) \
-  while (false)                     \
-  TORCH_CHECK_OP(val1, val2, >=)
-#define TORCH_DCHECK_GT(val1, val2) \
-  while (false)                     \
-  TORCH_CHECK_OP(val1, val2, >)
-#endif // NDEBUG
-
-// Check that a pointer is not null.
-#define TORCH_CHECK_NOTNULL(val) \
-  ::c10::CheckNotNull(           \
-      __FILE__, __LINE__, "Check failed: '" #val "' Must be non NULL", (val))
-
-#ifndef NDEBUG
-// Debug only version of TORCH_CHECK_NOTNULL
-#define TORCH_DCHECK_NOTNULL(val) \
-  ::c10::CheckNotNull(            \
-      __FILE__, __LINE__, "Check failed: '" #val "' Must be non NULL", (val))
-#else // !NDEBUG
-// Optimized version - generates no code.
-#define TORCH_DCHECK_NOTNULL(val) \
-  while (false)                   \
-  TORCH_CHECK_NOTNULL(val)
-#endif // NDEBUG
-
 // ---------------------- Support for std objects --------------------------
 // These are adapted from glog to support a limited set of logging capability
 // for STL objects.
--- a/c10/xpu/XPUCachingAllocator.cpp
+++ b/c10/xpu/XPUCachingAllocator.cpp
@ -926,15 +926,14 @@ class DeviceCachingAllocator {
          (release_cached_blocks() && alloc_block(params, true));
    }
    if (!block_found) {
-      c10::xpu::DeviceProp device_prop;
-      c10::xpu::get_device_properties(&device_prop, device);
-      auto device_total = device_prop.global_mem_size;
+      const auto& raw_device = c10::xpu::get_raw_device(device);
+      const auto device_total =
+          raw_device.get_info<sycl::info::device::global_mem_size>();
      // Estimate the available device memory when the SYCL runtime does not
      // support the corresponding aspect (ext_intel_free_memory).
-      size_t device_free = device_prop.global_mem_size -
+      size_t device_free = device_total -
          stats.reserved_bytes[static_cast<size_t>(StatType::AGGREGATE)]
              .current;
-      auto& raw_device = c10::xpu::get_raw_device(device);
      // TODO: Remove the aspect check once the SYCL runtime bug is fixed on
      // affected devices.
      if (raw_device.has(sycl::aspect::ext_intel_free_memory)) {
@ -1052,21 +1051,37 @@ class DeviceCachingAllocator {
    }
  }

+  // Returns {free, total} device memory for this allocator's device, as
+  // reported by the SYCL runtime (note the order: free first, then total).
+  // Fails via TORCH_CHECK when the device does not expose the
+  // ext_intel_free_memory aspect, since free memory cannot be queried then.
+  std::pair<size_t, size_t> getMemoryInfo() {
+    const auto& device = c10::xpu::get_raw_device(device_index);
+    const size_t total = device.get_info<sycl::info::device::global_mem_size>();
+    TORCH_CHECK(
+        device.has(sycl::aspect::ext_intel_free_memory),
+        "The device (",
+        device.get_info<sycl::info::device::name>(),
+        ") doesn't support querying the available free memory. ",
+        "You can file an issue at https://github.com/pytorch/pytorch/issues ",
+        "to help us prioritize its implementation.");
+    const size_t free =
+        device.get_info<sycl::ext::intel::info::device::free_memory>();
+    return {free, total};
+  }
+
  double getMemoryFraction() {
    if (!set_fraction) {
      return 1.0;
    }

-    c10::xpu::DeviceProp device_prop;
-    c10::xpu::get_device_properties(&device_prop, device_index);
+    const auto device_total =
+        xpu::get_raw_device(device_index)
+            .get_info<sycl::info::device::global_mem_size>();
    return static_cast<double>(allowed_memory_maximum) /
-        static_cast<double>(device_prop.global_mem_size);
+        static_cast<double>(device_total);
  }

  void setMemoryFraction(double fraction) {
-    c10::xpu::DeviceProp device_prop;
-    c10::xpu::get_device_properties(&device_prop, device_index);
-    auto device_total = device_prop.global_mem_size;
+    const auto device_total =
+        xpu::get_raw_device(device_index)
+            .get_info<sycl::info::device::global_mem_size>();
    allowed_memory_maximum = static_cast<size_t>(fraction * device_total);
    set_fraction = true;
  }
@ -1240,6 +1255,11 @@ class XPUAllocator : public DeviceAllocator {
        c10::xpu::get_raw_device(dev_to_access));
  }

+  // Validates the device index, then forwards to that device's caching
+  // allocator; the returned pair is {free, total} as produced there.
+  std::pair<size_t, size_t> getMemoryInfo(DeviceIndex device) override {
+    assertValidDevice(device);
+    return device_allocators[device]->getMemoryInfo();
+  }
+
  double getMemoryFraction(DeviceIndex device) {
    assertValidDevice(device);
    return device_allocators[device]->getMemoryFraction();
--- a/cmake/public/utils.cmake
+++ b/cmake/public/utils.cmake
@ -478,6 +478,7 @@ function(torch_update_find_cuda_flags)
 endfunction()

 include(CheckCXXCompilerFlag)
+include(CheckCCompilerFlag)
 include(CheckLinkerFlag)

 ##############################################################################
@ -501,6 +502,24 @@ function(append_cxx_flag_if_supported flag outputvar)
    endif()
 endfunction()

+# Appends `flag` to the string variable named by `outputvar` (updated in the
+# caller's scope) when the C compiler accepts it.
+#
+# The check result is stored under a variable named HAS<FLAG>: the flag is
+# upper-cased and every '=' and '-' is replaced with '_'.
+# NOTE(review): if append_cxx_flag_if_supported derives the same HAS<FLAG>
+# name, the C and C++ checks would share one result variable -- confirm.
+function(append_c_flag_if_supported flag outputvar)
+    string(TOUPPER "HAS${flag}" _FLAG_NAME)
+    string(REGEX REPLACE "[=-]" "_" _FLAG_NAME "${_FLAG_NAME}")
+
+    # GCC silences unknown -Wno-XXX flags, so test the corresponding -WXXX.
+    # Flags are passed with their leading dash (they are appended verbatim
+    # below), so the pattern must allow for it; the optional dash is captured
+    # and re-emitted so both "-Wno-foo" and "Wno-foo" are rewritten.
+    if(CMAKE_C_COMPILER_ID STREQUAL "GNU")
+        string(REGEX REPLACE "^(-?)Wno-" "\\1W" new_flag "${flag}")
+    else()
+        set(new_flag "${flag}")
+    endif()
+
+    check_c_compiler_flag("${new_flag}" ${_FLAG_NAME})
+    if(${_FLAG_NAME})
+        # Append the flag as originally requested, not the probed -WXXX form.
+        string(APPEND ${outputvar} " ${flag}")
+        set(${outputvar} "${${outputvar}}" PARENT_SCOPE)
+    endif()
+endfunction()
+
 function(target_compile_options_if_supported target flag)
  set(_compile_options "")
  append_cxx_flag_if_supported("${flag}" _compile_options)
--- a/docs/source/accelerator.md
+++ b/docs/source/accelerator.md
@ -40,6 +40,7 @@
    :nosignatures:

     empty_cache
+     get_memory_info
     max_memory_allocated
     max_memory_reserved
     memory_allocated
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@ -382,20 +382,6 @@ coverage_ignore_functions = [
    # torch.ao.quantization.backend_config.tensorrt
    "get_tensorrt_backend_config",
    "get_tensorrt_backend_config_dict",
-    # torch.ao.quantization.backend_config.utils
-    "entry_to_pretty_str",
-    "get_fused_module_classes",
-    "get_fuser_method_mapping",
-    "get_fusion_pattern_to_extra_inputs_getter",
-    "get_fusion_pattern_to_root_node_getter",
-    "get_module_to_qat_module",
-    "get_pattern_to_dtype_configs",
-    "get_pattern_to_input_type_to_index",
-    "get_qat_module_classes",
-    "get_root_module_to_quantized_reference_module",
-    "pattern_to_human_readable",
-    "remove_boolean_dispatch_from_name",
-    # torch.ao.quantization.backend_config.x86
    "get_x86_backend_config",
    # torch.ao.quantization.fuse_modules
    "fuse_known_modules",
@ -426,25 +412,6 @@ coverage_ignore_functions = [
    "insert_observers_for_model",
    "prepare",
    "propagate_dtypes_for_known_nodes",
-    # torch.ao.quantization.fx.utils
-    "all_node_args_except_first",
-    "all_node_args_have_no_tensors",
-    "assert_and_get_unique_device",
-    "collect_producer_nodes",
-    "create_getattr_from_value",
-    "create_node_from_old_node_preserve_meta",
-    "get_custom_module_class_keys",
-    "get_linear_prepack_op_for_dtype",
-    "get_new_attr_name_with_prefix",
-    "get_non_observable_arg_indexes_and_types",
-    "get_qconv_prepack_op",
-    "get_skipped_module_name_and_classes",
-    "graph_module_from_producer_nodes",
-    "maybe_get_next_module",
-    "node_arg_is_bias",
-    "node_arg_is_weight",
-    "return_arg_list",
-    # torch.ao.quantization.pt2e.graph_utils
    "bfs_trace_with_node_process",
    "find_sequential_partitions",
    "get_equivalent_types",
@ -860,80 +827,10 @@ coverage_ignore_functions = [
    "get_latency_of_one_partition",
    "get_latency_of_partitioned_graph",
    "get_partition_to_latency_mapping",
-    # torch.fx.experimental.proxy_tensor
-    "decompose",
-    "disable_autocast_cache",
-    "disable_proxy_modes_tracing",
-    "dispatch_trace",
-    "extract_val",
-    "fake_signature",
-    "fetch_sym_proxy",
-    "fetch_object_proxy",
-    "get_innermost_proxy_mode",
-    "get_isolated_graphmodule",
-    "get_proxy_slot",
-    "get_torch_dispatch_modes",
-    "has_proxy_slot",
-    "is_sym_node",
-    "maybe_handle_decomp",
-    "proxy_call",
-    "set_meta",
-    "set_original_aten_op",
-    "set_proxy_slot",
-    "snapshot_fake",
-    "thunkify",
-    "track_tensor",
-    "track_tensor_tree",
-    "wrap_key",
-    "wrapper_and_args_for_make_fx",
-    # torch.fx.experimental.recording
    "record_shapeenv_event",
    "replay_shape_env_events",
    "shape_env_check_state_equal",
-    # torch.fx.experimental.sym_node
-    "ceil_impl",
-    "floor_ceil_helper",
-    "floor_impl",
-    "method_to_operator",
-    "sympy_is_channels_last_contiguous_2d",
-    "sympy_is_channels_last_contiguous_3d",
-    "sympy_is_channels_last_strides_2d",
-    "sympy_is_channels_last_strides_3d",
-    "sympy_is_channels_last_strides_generic",
-    "sympy_is_contiguous",
-    "sympy_is_contiguous_generic",
-    "to_node",
-    "wrap_node",
    "sym_sqrt",
-    # torch.fx.experimental.symbolic_shapes
-    "bind_symbols",
-    "cast_symbool_to_symint_guardless",
-    "create_contiguous",
-    "error",
-    "eval_guards",
-    "eval_is_non_overlapping_and_dense",
-    "expect_true",
-    "find_symbol_binding_fx_nodes",
-    "free_symbols",
-    "free_unbacked_symbols",
-    "fx_placeholder_targets",
-    "fx_placeholder_vals",
-    "guard_bool",
-    "guard_float",
-    "guard_int",
-    "guard_scalar",
-    "has_hint",
-    "has_symbolic_sizes_strides",
-    "is_channels_last_contiguous_2d",
-    "is_channels_last_contiguous_3d",
-    "is_channels_last_strides_2d",
-    "is_channels_last_strides_3d",
-    "is_contiguous",
-    "is_non_overlapping_and_dense_indicator",
-    "is_nested_int",
-    "is_symbol_binding_fx_node",
-    "is_symbolic",
-    # torch.fx.experimental.unification.core
    "reify",
    # torch.fx.experimental.unification.match
    "edge",
@ -971,24 +868,6 @@ coverage_ignore_functions = [
    "reverse_dict",
    # torch.fx.experimental.unification.multipledispatch.variadic
    "isvariadic",
-    # torch.fx.experimental.unification.unification_tools
-    "assoc",
-    "assoc_in",
-    "dissoc",
-    "first",
-    "get_in",
-    "getter",
-    "groupby",
-    "itemfilter",
-    "itemmap",
-    "keyfilter",
-    "keymap",
-    "merge",
-    "merge_with",
-    "update_in",
-    "valfilter",
-    "valmap",
-    # torch.fx.experimental.unification.utils
    "freeze",
    "hashable",
    "raises",
--- a/docs/source/fx.experimental.md
+++ b/docs/source/fx.experimental.md
@ -12,6 +12,37 @@ These APIs are experimental and subject to change without notice.
 .. autoclass:: torch.fx.experimental.sym_node.DynamicInt
 ```

+## torch.fx.experimental.sym_node
+
+```{eval-rst}
+.. currentmodule:: torch.fx.experimental.sym_node
+```
+
+```{eval-rst}
+.. automodule:: torch.fx.experimental.sym_node
+```
+
+```{eval-rst}
+.. autosummary::
+    :toctree: generated
+    :nosignatures:
+
+    is_channels_last_contiguous_2d
+    is_channels_last_contiguous_3d
+    is_channels_last_strides_2d
+    is_channels_last_strides_3d
+    is_contiguous
+    is_non_overlapping_and_dense_indicator
+    method_to_operator
+    sympy_is_channels_last_contiguous_2d
+    sympy_is_channels_last_contiguous_3d
+    sympy_is_channels_last_strides_2d
+    sympy_is_channels_last_strides_3d
+    sympy_is_channels_last_strides_generic
+    sympy_is_contiguous
+    sympy_is_contiguous_generic
+```
+
 ## torch.fx.experimental.symbolic_shapes

 ```{eval-rst}
@ -69,6 +100,25 @@ These APIs are experimental and subject to change without notice.
    rebind_unbacked
    resolve_unbacked_bindings
    is_accessor_node
+    cast_symbool_to_symint_guardless
+    create_contiguous
+    error
+    eval_guards
+    eval_is_non_overlapping_and_dense
+    find_symbol_binding_fx_nodes
+    free_symbols
+    free_unbacked_symbols
+    fx_placeholder_targets
+    fx_placeholder_vals
+    guard_bool
+    guard_float
+    guard_int
+    guard_scalar
+    has_hint
+    has_symbolic_sizes_strides
+    is_nested_int
+    is_symbol_binding_fx_node
+    is_symbolic
 ```

 ## torch.fx.experimental.proxy_tensor
@ -91,4 +141,46 @@ These APIs are experimental and subject to change without notice.
    get_proxy_mode
    maybe_enable_thunkify
    maybe_disable_thunkify
+    decompose
+    disable_autocast_cache
+    disable_proxy_modes_tracing
+    extract_val
+    fake_signature
+    fetch_object_proxy
+    fetch_sym_proxy
+    has_proxy_slot
+    is_sym_node
+    maybe_handle_decomp
+    proxy_call
+    set_meta
+    set_original_aten_op
+    set_proxy_slot
+    snapshot_fake
 ```
+
+## torch.fx.experimental.unification.unification_tools
+
+```{eval-rst}
+.. currentmodule:: torch.fx.experimental.unification.unification_tools
+```
+
+```{eval-rst}
+.. automodule:: torch.fx.experimental.unification.unification_tools
+```
+
+```{eval-rst}
+.. autosummary::
+    :toctree: generated
+    :nosignatures:
+
+    assoc
+    assoc_in
+    dissoc
+    first
+    keyfilter
+    keymap
+    merge
+    merge_with
+    update_in
+    valfilter
+    valmap
--- a/docs/source/fx.md
+++ b/docs/source/fx.md
@ -1134,7 +1134,6 @@ The set of leaf modules can be customized by overriding
 .. py:module:: torch.fx.experimental.refinement_types
 .. py:module:: torch.fx.experimental.rewriter
 .. py:module:: torch.fx.experimental.schema_type_annotation
-.. py:module:: torch.fx.experimental.sym_node
 .. py:module:: torch.fx.experimental.unification.core
 .. py:module:: torch.fx.experimental.unification.dispatch
 .. py:module:: torch.fx.experimental.unification.match
@ -1144,7 +1143,6 @@ The set of leaf modules can be customized by overriding
 .. py:module:: torch.fx.experimental.unification.multipledispatch.dispatcher
 .. py:module:: torch.fx.experimental.unification.multipledispatch.utils
 .. py:module:: torch.fx.experimental.unification.multipledispatch.variadic
-.. py:module:: torch.fx.experimental.unification.unification_tools
 .. py:module:: torch.fx.experimental.unification.utils
 .. py:module:: torch.fx.experimental.unification.variable
 .. py:module:: torch.fx.experimental.unify_refinements
--- a/docs/source/quantization-support.md
+++ b/docs/source/quantization-support.md
@ -134,6 +134,23 @@ Quantization to work with this as well.
    ObservationType
 ```

+## torch.ao.quantization.backend_config.utils
+```{eval-rst}
+.. currentmodule:: torch.ao.quantization.backend_config.utils
+```
+
+```{eval-rst}
+.. autosummary::
+    :toctree: generated
+    :nosignatures:
+    :template: classtemplate.rst
+
+    entry_to_pretty_str
+    pattern_to_human_readable
+    remove_boolean_dispatch_from_name
+
+```
+
 ## torch.ao.quantization.fx.custom_config

 This module contains a few CustomConfig classes that's used in both eager mode and FX graph mode quantization
@ -154,6 +171,30 @@ This module contains a few CustomConfig classes that's used in both eager mode a
    StandaloneModuleConfigEntry
 ```

+## torch.ao.quantization.fx.utils
+
+```{eval-rst}
+.. currentmodule:: torch.ao.quantization.fx.utils
+```
+
+```{eval-rst}
+.. autosummary::
+    :toctree: generated
+    :nosignatures:
+    :template: classtemplate.rst
+
+    all_node_args_except_first
+    all_node_args_have_no_tensors
+    collect_producer_nodes
+    create_getattr_from_value
+    create_node_from_old_node_preserve_meta
+    graph_module_from_producer_nodes
+    maybe_get_next_module
+    node_arg_is_bias
+    node_arg_is_weight
+    return_arg_list
+```
+
 ## torch.ao.quantization.quantizer

 ```{eval-rst}
--- a/pyproject.toml
+++ b/pyproject.toml
@ -172,9 +172,9 @@ ignore = [
    "SIM102", "SIM103", "SIM112", # flake8-simplify code styles
    "SIM105", # these ignores are from flake8-simplify. please fix or ignore with commented reason
    "SIM108", # SIM108 ignored because we prefer if-else-block instead of ternary expression
-    "SIM110",
+    "SIM110", # Checks for `for` loops that can be replaced with a builtin function, like `any` or `all`.
    "SIM114", # Combine `if` branches using logical `or` operator
-    "SIM115",
+    "SIM115", # Checks for cases where files are opened without using a context manager.
    "SIM116", # Disable Use a dictionary instead of consecutive `if` statements
    "SIM117",
    "SIM118",
@ -184,7 +184,6 @@ ignore = [
    "TC006",
    # TODO: Remove Python-3.10 specific suppressions
    "B905",
-    "UP035",
 ]
 select = [
    "B",
@ -261,6 +260,7 @@ select = [
    "TRY401", # verbose-log-message
    "UP",
    "YTT",
+    "S101",
 ]

 [tool.ruff.lint.pyupgrade]
@ -340,6 +340,39 @@ keep-runtime-typing = true
 "tools/linter/**" = [
    "LOG015" # please fix
 ]
+"benchmarks/**" = [
+    "S101"
+]
+"test/**" = [
+    "S101"
+]
+"torchgen/**" = [
+    "S101"
+]
+"torch/**" = [
+    "S101"
+]
+"tools/**" = [
+    "S101"
+]
+"setup.py" = [
+    "S101"
+]
+"functorch/**" = [
+    "S101"
+]
+"docs/**" = [
+    "S101"
+]
+"android/**" = [
+    "S101"
+]
+".github/**" = [
+    "S101"
+]
+".ci/**" = [
+    "S101"
+]

 [tool.codespell]
 ignore-words = "tools/linter/dictionary.txt"
--- a/setup.py
+++ b/setup.py
@ -1646,8 +1646,7 @@ def main() -> None:
    mirror_files_into_torchgen()
    if RUN_BUILD_DEPS:
        build_deps()
-
-    mirror_inductor_external_kernels()
+        mirror_inductor_external_kernels()

    (
        ext_modules,
--- a/test/ao/sparsity/test_data_sparsifier.py
+++ b/test/ao/sparsity/test_data_sparsifier.py
@ -208,7 +208,7 @@ class _BaseDataSparsiferTestCase(TestCase):
        assert len(sparsifier1.data_groups) == len(sparsifier2.data_groups)

        state1 = state_dict1["state"]
-        for name in state1.keys():
+        for name in state1:
            # compare mask
            assert name in sparsifier2.state
            assert "mask" in sparsifier2.state[name]
--- a/test/ao/sparsity/test_sparsifier.py
+++ b/test/ao/sparsity/test_sparsifier.py
@ -119,7 +119,7 @@ class TestBaseSparsifier(TestCase):
        for idx in range(len(sparsifier0.groups)):
            mg0 = sparsifier0.groups[idx]
            mg1 = sparsifier1.groups[idx]
-            for key in mg0.keys():
+            for key in mg0:
                assert key in mg1
                if key == "module":
                    # We cannot compare modules as they are different
--- a/test/cpp_extensions/libtorch_agnostic_extension/libtorch_agnostic/csrc/kernel.cpp
+++ b/test/cpp_extensions/libtorch_agnostic_extension/libtorch_agnostic/csrc/kernel.cpp
@ -1,5 +1,6 @@
 #include <torch/csrc/inductor/aoti_torch/c/shim.h>
 #include <torch/csrc/stable/accelerator.h>
+#include <torch/csrc/stable/device.h>
 #include <torch/csrc/stable/library.h>
 #include <torch/csrc/stable/tensor.h>
 #include <torch/csrc/stable/ops.h>
@ -67,13 +68,13 @@ Tensor sgd_out_of_place(

 void boxed_sgd_out_of_place(StableIValue* stack, uint64_t num_args, uint64_t num_outputs) {
  Tensor res = sgd_out_of_place(
-    to<Tensor>(stack[0]),
-    to<Tensor>(stack[1]),
-    float(to<double>(stack[2])),
-    to<double>(stack[3]),
-    to<bool>(stack[4]));
+    torch::stable::detail::to<Tensor>(stack[0]),
+    torch::stable::detail::to<Tensor>(stack[1]),
+    float(torch::stable::detail::to<double>(stack[2])),
+    torch::stable::detail::to<double>(stack[3]),
+    torch::stable::detail::to<bool>(stack[4]));

-  stack[0] = from(res);
+  stack[0] = torch::stable::detail::from(res);
 }

 STABLE_TORCH_LIBRARY(libtorch_agnostic, m) {
@ -89,8 +90,8 @@ Tensor identity(Tensor t) {
 }

 void boxed_identity(StableIValue* stack, uint64_t num_args, uint64_t num_outputs) {
-  Tensor res = identity(to<Tensor>(stack[0]));
-  stack[0] = from(res);
+  Tensor res = identity(torch::stable::detail::to<Tensor>(stack[0]));
+  stack[0] = torch::stable::detail::from(res);
 }

 STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
@ -108,14 +109,14 @@ STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CPU, m) {
 Tensor my_abs(Tensor t) {
  const auto num_args = 1;
  StableIValue stack[num_args];
-  stack[0] = from(t);
+  stack[0] = torch::stable::detail::from(t);
  aoti_torch_call_dispatcher("aten::abs", "", stack);
-  return to<Tensor>(stack[0]);
+  return torch::stable::detail::to<Tensor>(stack[0]);
 }

 void boxed_my_abs(StableIValue* stack, uint64_t num_args, uint64_t num_outputs) {
-  Tensor tensor_res = my_abs(to<Tensor>(stack[0]));
-  stack[0] = from(tensor_res);
+  Tensor tensor_res = my_abs(torch::stable::detail::to<Tensor>(stack[0]));
+  stack[0] = torch::stable::detail::from(tensor_res);
 }

 STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
@ -132,21 +133,21 @@ Tensor my_ones_like(Tensor t, StableIValue device) {

  auto mf = aoti_torch_memory_format_contiguous_format();

-  stack[0] = from(t);
-  stack[1] = from(std::optional(t.scalar_type()));    // dtype
-  stack[2] = from(std::nullopt);              // layout
-  stack[3] = from(std::optional(device));     // device
-  stack[4] = from(std::optional(false));      // pin_memory
-  stack[5] = from(std::optional(mf));         // memory_format
+  stack[0] = torch::stable::detail::from(t);
+  stack[1] = torch::stable::detail::from(std::optional(t.scalar_type()));    // dtype
+  stack[2] = torch::stable::detail::from(std::nullopt);              // layout
+  stack[3] = torch::stable::detail::from(std::optional(device));     // device
+  stack[4] = torch::stable::detail::from(std::optional(false));      // pin_memory
+  stack[5] = torch::stable::detail::from(std::optional(mf));         // memory_format

  aoti_torch_call_dispatcher("aten::ones_like", "", stack);

-  return to<Tensor>(stack[0]);
+  return torch::stable::detail::to<Tensor>(stack[0]);
 }

 void boxed_my_ones_like(StableIValue* stack, uint64_t num_args, uint64_t num_outputs) {
-  Tensor res = my_ones_like(to<Tensor>(stack[0]), stack[1]);
-  stack[0] = from(res);
+  Tensor res = my_ones_like(torch::stable::detail::to<Tensor>(stack[0]), stack[1]);
+  stack[0] = torch::stable::detail::from(res);
 }

 STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
@ -159,28 +160,28 @@ STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeExplicitAutograd, m) {

 std::tuple<Tensor, Tensor, bool> exp_neg_is_leaf(Tensor t1, Tensor t2, Tensor t3) {
  StableIValue stack_exp[1];
-  stack_exp[0] = from(t1);
+  stack_exp[0] = torch::stable::detail::from(t1);
  aoti_torch_call_dispatcher("aten::exp", "", stack_exp);

  StableIValue stack_neg[1];
-  stack_neg[0] = from(t2);
+  stack_neg[0] = torch::stable::detail::from(t2);
  aoti_torch_call_dispatcher("aten::neg", "", stack_neg);

  StableIValue stack_is_leaf[1];
-  stack_is_leaf[0] = from(t3);
+  stack_is_leaf[0] = torch::stable::detail::from(t3);
  aoti_torch_call_dispatcher("aten::is_leaf", "", stack_is_leaf);

  return std::make_tuple(
-    to<Tensor>(stack_exp[0]),
-    to<Tensor>(stack_neg[0]),
-    to<bool>(stack_is_leaf[0]));
+    torch::stable::detail::to<Tensor>(stack_exp[0]),
+    torch::stable::detail::to<Tensor>(stack_neg[0]),
+    torch::stable::detail::to<bool>(stack_is_leaf[0]));
 }

 void boxed_exp_neg_is_leaf(StableIValue* stack, uint64_t num_args, uint64_t num_outputs) {
-  auto tuple = exp_neg_is_leaf(to<Tensor>(stack[0]), to<Tensor>(stack[1]), to<Tensor>(stack[2]));
-  stack[0] = from(std::get<0>(tuple));
-  stack[1] = from(std::get<1>(tuple));
-  stack[2] = from(std::get<2>(tuple));
+  auto tuple = exp_neg_is_leaf(torch::stable::detail::to<Tensor>(stack[0]), torch::stable::detail::to<Tensor>(stack[1]), torch::stable::detail::to<Tensor>(stack[2]));
+  stack[0] = torch::stable::detail::from(std::get<0>(tuple));
+  stack[1] = torch::stable::detail::from(std::get<1>(tuple));
+  stack[2] = torch::stable::detail::from(std::get<2>(tuple));
 }

 STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
@ -193,15 +194,15 @@ STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeExplicitAutograd, m) {

 Tensor neg_exp(Tensor t) {
  StableIValue stack[1];
-  stack[0] = from(t);
+  stack[0] = torch::stable::detail::from(t);
  aoti_torch_call_dispatcher("aten::exp", "", stack);
  aoti_torch_call_dispatcher("aten::neg", "", stack);
-  return to<Tensor>(stack[0]);
+  return torch::stable::detail::to<Tensor>(stack[0]);
 }

 void boxed_neg_exp(StableIValue* stack, uint64_t num_args, uint64_t num_outputs) {
-  Tensor res = neg_exp(to<Tensor>(stack[0]));
-  stack[0] = from(res);
+  Tensor res = neg_exp(torch::stable::detail::to<Tensor>(stack[0]));
+  stack[0] = torch::stable::detail::from(res);
 }

 STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
@ -214,10 +215,10 @@ STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeExplicitAutograd, m) {

 Tensor divide_neg_exp(Tensor t) {
  StableIValue stack_neg[1];
-  stack_neg[0] = from(t);
+  stack_neg[0] = torch::stable::detail::from(t);

  StableIValue stack_exp[1];
-  stack_exp[0] = from(t);
+  stack_exp[0] = torch::stable::detail::from(t);
  aoti_torch_call_dispatcher("aten::exp", "", stack_exp);
  aoti_torch_call_dispatcher("aten::neg", "", stack_neg);

@ -225,12 +226,12 @@ Tensor divide_neg_exp(Tensor t) {
  stack_div[0] = stack_neg[0];
  stack_div[1] = stack_exp[0];
  aoti_torch_call_dispatcher("aten::divide", "Tensor", stack_div);
-  return to<Tensor>(stack_div[0]);
+  return torch::stable::detail::to<Tensor>(stack_div[0]);
 }

 void boxed_divide_neg_exp(StableIValue* stack, uint64_t num_args, uint64_t num_outputs) {
-  Tensor res = divide_neg_exp(to<Tensor>(stack[0]));
-  stack[0] = from(res);
+  Tensor res = divide_neg_exp(torch::stable::detail::to<Tensor>(stack[0]));
+  stack[0] = torch::stable::detail::from(res);
 }

 STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
@ -246,8 +247,8 @@ bool is_contiguous(Tensor t) {
 }

 void boxed_is_contiguous(StableIValue* stack, uint64_t num_args, uint64_t num_outputs) {
-  bool res = is_contiguous(to<Tensor>(stack[0]));
-  stack[0] = from(res);
+  bool res = is_contiguous(torch::stable::detail::to<Tensor>(stack[0]));
+  stack[0] = torch::stable::detail::from(res);
 }

 STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
@ -263,9 +264,9 @@ Tensor my_transpose(Tensor t, int64_t dim0, int64_t dim1) {
 }

 void boxed_my_transpose(StableIValue* stack, uint64_t num_args, uint64_t num_outputs) {
-  auto res = my_transpose(to<Tensor>(stack[0]), to<int64_t>(stack[1]), to<int64_t>(stack[2]));
+  auto res = my_transpose(torch::stable::detail::to<Tensor>(stack[0]), torch::stable::detail::to<int64_t>(stack[1]), torch::stable::detail::to<int64_t>(stack[2]));

-  stack[0] = from(res);
+  stack[0] = torch::stable::detail::from(res);
 }

 Tensor my_empty_like(Tensor t) {
@ -273,8 +274,8 @@ Tensor my_empty_like(Tensor t) {
 }

 void boxed_empty_like(StableIValue* stack, uint64_t num_args, uint64_t num_outputs) {
-  auto res = my_empty_like(to<Tensor>(stack[0]));
-  stack[0] = from(res);
+  auto res = my_empty_like(torch::stable::detail::to<Tensor>(stack[0]));
+  stack[0] = torch::stable::detail::from(res);
 }

 bool my_is_cpu(Tensor t) {
@ -283,8 +284,8 @@ bool my_is_cpu(Tensor t) {


 void boxed_my_is_cpu(StableIValue* stack, uint64_t num_args, uint64_t num_outputs) {
-  auto res = my_is_cpu(to<Tensor>(stack[0]));
-  stack[0] = from(res);
+  auto res = my_is_cpu(torch::stable::detail::to<Tensor>(stack[0]));
+  stack[0] = torch::stable::detail::from(res);
 }

 Tensor fill_infinity(Tensor t) {
@ -296,8 +297,8 @@ void boxed_fill_infinity(
    StableIValue* stack,
    uint64_t num_args,
    uint64_t num_outputs) {
-  auto res = fill_infinity(to<Tensor>(stack[0]));
-  stack[0] = from(res);
+  auto res = fill_infinity(torch::stable::detail::to<Tensor>(stack[0]));
+  stack[0] = torch::stable::detail::from(res);
 }

 Tensor my_pad(Tensor t) {
@ -310,8 +311,8 @@ void boxed_my_pad(
    StableIValue* stack,
    uint64_t num_args,
    uint64_t num_outputs) {
-  auto res = my_pad(to<Tensor>(stack[0]));
-  stack[0] = from(res);
+  auto res = my_pad(torch::stable::detail::to<Tensor>(stack[0]));
+  stack[0] = torch::stable::detail::from(res);
 }

 Tensor my_narrow(Tensor t, int64_t dim, int64_t start, int64_t length) {
@ -323,11 +324,11 @@ void boxed_my_narrow(
    uint64_t num_args,
    uint64_t num_outputs) {
  auto res = my_narrow(
-      to<Tensor>(stack[0]),
-      to<int64_t>(stack[1]),
-      to<int64_t>(stack[2]),
-      to<int64_t>(stack[3]));
-  stack[0] = from(res);
+      torch::stable::detail::to<Tensor>(stack[0]),
+      torch::stable::detail::to<int64_t>(stack[1]),
+      torch::stable::detail::to<int64_t>(stack[2]),
+      torch::stable::detail::to<int64_t>(stack[3]));
+  stack[0] = torch::stable::detail::from(res);
 }

 Tensor my_new_empty_dtype_variant(Tensor t) {
@ -342,8 +343,8 @@ Tensor my_new_empty_dtype_variant(Tensor t) {
 }

 void boxed_my_new_empty_dtype_variant(StableIValue* stack, uint64_t num_args, uint64_t num_outputs) {
-  auto res = my_new_empty_dtype_variant(to<Tensor>(stack[0]));
-  stack[0] = from(res);
+  auto res = my_new_empty_dtype_variant(torch::stable::detail::to<Tensor>(stack[0]));
+  stack[0] = torch::stable::detail::from(res);
 }

 Tensor my_new_zeros_dtype_variant(Tensor t) {
@ -352,8 +353,8 @@ Tensor my_new_zeros_dtype_variant(Tensor t) {
 }

 void boxed_my_new_zeros_dtype_variant(StableIValue* stack, uint64_t num_args, uint64_t num_outputs) {
-  auto res = my_new_zeros_dtype_variant(to<Tensor>(stack[0]));
-  stack[0] = from(res);
+  auto res = my_new_zeros_dtype_variant(torch::stable::detail::to<Tensor>(stack[0]));
+  stack[0] = torch::stable::detail::from(res);
 }

 Tensor my_copy_(Tensor dst, Tensor src, bool non_blocking) {
@ -361,8 +362,8 @@ Tensor my_copy_(Tensor dst, Tensor src, bool non_blocking) {
 }

 void boxed_my_copy_(StableIValue* stack, uint64_t num_args, uint64_t num_outputs) {
-  Tensor tensor_res = my_copy_(to<Tensor>(stack[0]), to<Tensor>(stack[1]), to<bool>(stack[2]));
-  stack[0] = from(tensor_res);
+  Tensor tensor_res = my_copy_(torch::stable::detail::to<Tensor>(stack[0]), torch::stable::detail::to<Tensor>(stack[1]), torch::stable::detail::to<bool>(stack[2]));
+  stack[0] = torch::stable::detail::from(tensor_res);
 }

 Tensor my_clone(Tensor t) {
@ -370,8 +371,8 @@ Tensor my_clone(Tensor t) {
 }

 void boxed_my_clone(StableIValue* stack, uint64_t num_args, uint64_t num_outputs) {
-  Tensor tensor_res = my_clone(to<Tensor>(stack[0]));
-  stack[0] = from(tensor_res);
+  Tensor tensor_res = my_clone(torch::stable::detail::to<Tensor>(stack[0]));
+  stack[0] = torch::stable::detail::from(tensor_res);
 }


@ -408,8 +409,8 @@ Tensor my_zero_(Tensor t) {
 }

 void boxed_my_zero_(StableIValue* stack, uint64_t num_args, uint64_t num_outputs) {
-  auto res = my_zero_(to<Tensor>(stack[0]));
-  stack[0] = from(res);
+  auto res = my_zero_(torch::stable::detail::to<Tensor>(stack[0]));
+  stack[0] = torch::stable::detail::from(res);
 }

 Tensor my_amax(Tensor t) {
@ -417,8 +418,8 @@ Tensor my_amax(Tensor t) {
 }

 void boxed_my_amax(StableIValue* stack, uint64_t num_args, uint64_t num_outputs) {
-  auto res = my_amax(to<Tensor>(stack[0]));
-  stack[0] = from(res);
+  auto res = my_amax(torch::stable::detail::to<Tensor>(stack[0]));
+  stack[0] = torch::stable::detail::from(res);
 }

 Tensor my_amax_vec(Tensor t) {
@ -426,8 +427,8 @@ Tensor my_amax_vec(Tensor t) {
 }

 void boxed_my_amax_vec(StableIValue* stack, uint64_t num_args, uint64_t num_outputs) {
-  auto res = my_amax_vec(to<Tensor>(stack[0]));
-  stack[0] = from(res);
+  auto res = my_amax_vec(torch::stable::detail::to<Tensor>(stack[0]));
+  stack[0] = torch::stable::detail::from(res);
 }

 STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
@ -464,8 +465,8 @@ void boxed_test_default_constructor(
    StableIValue* stack,
    uint64_t num_args,
    uint64_t num_outputs) {
-  bool res = test_default_constructor(to<bool>(stack[0]));
-  stack[0] = from(res);
+  bool res = test_default_constructor(torch::stable::detail::to<bool>(stack[0]));
+  stack[0] = torch::stable::detail::from(res);
 }

 STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
@ -478,6 +479,181 @@ STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeExplicitAutograd, m) {
  m.impl("my_amax_vec", &boxed_my_amax_vec);
 }

+// Out-of-place foreach multiply: boxes both tensor lists, calls the "List"
+// overload of aten::_foreach_mul through the dispatcher, and unboxes the
+// result from the first stack slot.
+std::vector<Tensor> my__foreach_mul(
+    torch::headeronly::HeaderOnlyArrayRef<Tensor> self,
+    torch::headeronly::HeaderOnlyArrayRef<Tensor> other) {
+  namespace sd = torch::stable::detail;
+
+  StableIValue stack[2];
+  stack[0] = sd::from(self);
+  stack[1] = sd::from(other);
+  aoti_torch_call_dispatcher("aten::_foreach_mul", "List", stack);
+  return sd::to<std::vector<Tensor>>(stack[0]);
+}
+
+void boxed_my__foreach_mul(StableIValue* stack, uint64_t num_args, uint64_t num_outputs) {
+  namespace sd = torch::stable::detail;
+  // The arguments are unboxed as std::vector<Tensor> and NOT as
+  // HeaderOnlyArrayRef<Tensor>: calling `to` on a StableIValue yields a result
+  // that owns its underlying data, and a non-owning HeaderOnlyArrayRef cannot
+  // safely steward that result.
+  auto res = my__foreach_mul(
+      sd::to<std::vector<Tensor>>(stack[0]),
+      sd::to<std::vector<Tensor>>(stack[1]));
+  stack[0] = sd::from(res);
+}
+
+// In-place foreach multiply: boxes both tensor lists and calls the "List"
+// overload of aten::_foreach_mul_ through the dispatcher. Nothing is read
+// back from the stack afterwards.
+void my__foreach_mul_(
+    torch::headeronly::HeaderOnlyArrayRef<Tensor> self,
+    torch::headeronly::HeaderOnlyArrayRef<Tensor> other) {
+  namespace sd = torch::stable::detail;
+
+  StableIValue stack[2];
+  stack[0] = sd::from(self);
+  stack[1] = sd::from(other);
+  aoti_torch_call_dispatcher("aten::_foreach_mul_", "List", stack);
+}
+
+void boxed_my__foreach_mul_(StableIValue* stack, uint64_t num_args, uint64_t num_outputs) {
+  namespace sd = torch::stable::detail;
+  // The wrapped op returns void, so this wrapper writes nothing back to the stack.
+  my__foreach_mul_(
+      sd::to<std::vector<Tensor>>(stack[0]),
+      sd::to<std::vector<Tensor>>(stack[1]));
+}
+
+// Exercises my__foreach_mul with brace-enclosed initializer lists (rather than
+// std::vectors) as arguments. Each input is cloned twice so the two lists hold
+// distinct tensors.
+std::vector<Tensor> make_tensor_clones_and_call_foreach(Tensor t1, Tensor t2) {
+  Tensor lhs_first = my_clone(t1);
+  Tensor rhs_first = my_clone(t1);
+  Tensor lhs_second = my_clone(t2);
+  Tensor rhs_second = my_clone(t2);
+  return my__foreach_mul({lhs_first, lhs_second}, {rhs_first, rhs_second});
+}
+
+void boxed_make_tensor_clones_and_call_foreach(StableIValue* stack, uint64_t num_args, uint64_t num_outputs) {
+  namespace sd = torch::stable::detail;
+  // Unbox the two tensors, run the op, and box the resulting list into slot 0.
+  auto res = make_tensor_clones_and_call_foreach(
+      sd::to<Tensor>(stack[0]),
+      sd::to<Tensor>(stack[1]));
+  stack[0] = sd::from(res);
+}
+
+// Schemas for the foreach test ops. my__foreach_mul_ is declared as mutating
+// `self` (Tensor(a!)[]) and returning nothing; the other two return Tensor[].
+STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
+  m.def("my__foreach_mul(Tensor[] self, Tensor[] other) -> Tensor[]");
+  m.def("my__foreach_mul_(Tensor(a!)[] self, Tensor[] other) -> ()");
+  m.def("make_tensor_clones_and_call_foreach(Tensor t1, Tensor t2) -> Tensor[]");
+}
+
+// Binds the boxed wrappers for the foreach test ops under the
+// CompositeExplicitAutograd key.
+STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeExplicitAutograd, m) {
+  m.impl("my__foreach_mul", &boxed_my__foreach_mul);
+  m.impl("my__foreach_mul_", &boxed_my__foreach_mul_);
+  m.impl("make_tensor_clones_and_call_foreach", &boxed_make_tensor_clones_and_call_foreach);
+}
+
+// Test functions for torch::stable::Device
+
+// Builds a torch::stable::Device through one of two constructors:
+//   - use_str == false: Device(DeviceType::CUDA, index) or Device(DeviceType::CPU)
+//   - use_str == true:  Device("cuda:<index>") or Device("cpu")
+// `index` is ignored on both CPU paths.
+torch::stable::Device test_device_constructor(
+    bool is_cuda,
+    torch::stable::DeviceIndex index,
+    bool use_str) {
+  using torch::stable::Device;
+  using torch::stable::DeviceType;
+
+  if (!use_str) {
+    return is_cuda ? Device(DeviceType::CUDA, index) : Device(DeviceType::CPU);
+  }
+
+  std::string device_str =
+      is_cuda ? "cuda:" + std::to_string(index) : std::string("cpu");
+  return Device(device_str);
+}
+
+void boxed_test_device_constructor(
+    StableIValue* stack,
+    uint64_t num_args,
+    uint64_t num_outputs) {
+  namespace sd = torch::stable::detail;
+  // Stack layout on entry: [is_cuda, index, use_str]. The constructed Device
+  // is boxed into slot 0.
+  torch::stable::Device res = test_device_constructor(
+      sd::to<bool>(stack[0]),
+      sd::to<torch::stable::DeviceIndex>(stack[1]),
+      sd::to<bool>(stack[2]));
+  stack[0] = sd::from(res);
+}
+
+// Returns the result of torch::stable::Device's operator== on the two devices.
+bool test_device_equality(torch::stable::Device d1, torch::stable::Device d2) {
+  return d1 == d2;
+}
+
+void boxed_test_device_equality(
+    StableIValue* stack,
+    uint64_t num_args,
+    uint64_t num_outputs) {
+  namespace sd = torch::stable::detail;
+  // Both operands are unboxed as torch::stable::Device; the bool verdict is
+  // boxed into slot 0.
+  bool res = test_device_equality(
+      sd::to<torch::stable::Device>(stack[0]),
+      sd::to<torch::stable::Device>(stack[1]));
+  stack[0] = sd::from(res);
+}
+
+// Calls set_index(index) on the by-value `device` argument and returns that
+// modified copy.
+torch::stable::Device test_device_set_index(
+    torch::stable::Device device,
+    torch::stable::DeviceIndex index) {
+  device.set_index(index);
+  return device;
+}
+
+void boxed_test_device_set_index(
+    StableIValue* stack,
+    uint64_t num_args,
+    uint64_t num_outputs) {
+  namespace sd = torch::stable::detail;
+  // Stack layout on entry: [device, index]. The updated Device is boxed into
+  // slot 0.
+  torch::stable::Device res = test_device_set_index(
+      sd::to<torch::stable::Device>(stack[0]),
+      sd::to<torch::stable::DeviceIndex>(stack[1]));
+  stack[0] = sd::from(res);
+}
+
+// Returns whatever Device::index() reports for `device`.
+torch::stable::DeviceIndex test_device_index(torch::stable::Device device) {
+  return device.index();
+}
+
+void boxed_test_device_index(
+    StableIValue* stack,
+    uint64_t num_args,
+    uint64_t num_outputs) {
+  namespace sd = torch::stable::detail;
+  // Unbox the Device from slot 0 and overwrite that slot with its index.
+  torch::stable::DeviceIndex res =
+      test_device_index(sd::to<torch::stable::Device>(stack[0]));
+  stack[0] = sd::from(res);
+}
+
+// Returns whatever Device::is_cuda() reports for `device`.
+bool test_device_is_cuda(torch::stable::Device device) {
+  return device.is_cuda();
+}
+
+void boxed_test_device_is_cuda(
+    StableIValue* stack,
+    uint64_t num_args,
+    uint64_t num_outputs) {
+  namespace sd = torch::stable::detail;
+  // Unbox the Device from slot 0 and overwrite that slot with the bool result.
+  bool res = test_device_is_cuda(sd::to<torch::stable::Device>(stack[0]));
+  stack[0] = sd::from(res);
+}
+
+// Returns whatever Device::is_cpu() reports for `device`.
+bool test_device_is_cpu(torch::stable::Device device) {
+  return device.is_cpu();
+}
+
+void boxed_test_device_is_cpu(
+    StableIValue* stack,
+    uint64_t num_args,
+    uint64_t num_outputs) {
+  namespace sd = torch::stable::detail;
+  // Unbox the Device from slot 0 and overwrite that slot with the bool result.
+  bool res = test_device_is_cpu(sd::to<torch::stable::Device>(stack[0]));
+  stack[0] = sd::from(res);
+}
+
+// Schemas for the torch::stable::Device test ops. They use the `Device` and
+// `DeviceIndex` schema types for arguments and returns.
+STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
+  m.def("test_device_constructor(bool is_cuda, DeviceIndex index, bool use_str) -> Device");
+  m.def("test_device_equality(Device d1, Device d2) -> bool");
+  m.def("test_device_set_index(Device device, DeviceIndex index) -> Device");
+  m.def("test_device_index(Device device) -> DeviceIndex");
+  m.def("test_device_is_cuda(Device device) -> bool");
+  m.def("test_device_is_cpu(Device device) -> bool");
+}
+
+// Binds the boxed wrappers for the Device test ops under the
+// CompositeExplicitAutograd key.
+STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeExplicitAutograd, m) {
+  m.impl("test_device_constructor", &boxed_test_device_constructor);
+  m.impl("test_device_equality", &boxed_test_device_equality);
+  m.impl("test_device_set_index", &boxed_test_device_set_index);
+  m.impl("test_device_index", &boxed_test_device_index);
+  m.impl("test_device_is_cuda", &boxed_test_device_is_cuda);
+  m.impl("test_device_is_cpu", &boxed_test_device_is_cpu);
+}
+
 // Test functions for torch::stable::accelerator APIs

 #ifdef LAE_USE_CUDA
@ -500,8 +676,8 @@ void boxed_test_device_guard(
    StableIValue* stack,
    uint64_t num_args,
    uint64_t num_outputs) {
-  int res = test_device_guard(static_cast<int64_t>(to<int64_t>(stack[0])));
-  stack[0] = from(res);
+  int res = test_device_guard(static_cast<int64_t>(torch::stable::detail::to<int64_t>(stack[0])));
+  stack[0] = torch::stable::detail::from(res);
 }

 int64_t test_device_guard_set_index() {
@ -520,7 +696,7 @@ void boxed_test_device_guard_set_index(
    uint64_t num_args,
    uint64_t num_outputs) {
  int64_t res = test_device_guard_set_index();
-  stack[0] = from(res);
+  stack[0] = torch::stable::detail::from(res);
 }

 int64_t test_stream(int32_t device_index) {
@ -536,8 +712,8 @@ void boxed_test_stream(
    StableIValue* stack,
    uint64_t num_args,
    uint64_t num_outputs) {
-  int64_t res = test_stream(static_cast<int64_t>(to<int64_t>(stack[0])));
-  stack[0] = from(res);
+  int64_t res = test_stream(static_cast<int64_t>(torch::stable::detail::to<int64_t>(stack[0])));
+  stack[0] = torch::stable::detail::from(res);
 }

 int64_t test_get_current_device_index() {
@ -549,7 +725,7 @@ void boxed_test_get_current_device_index(
    uint64_t num_args,
    uint64_t num_outputs) {
  int64_t res = test_get_current_device_index();
-  stack[0] = from(res);
+  stack[0] = torch::stable::detail::from(res);
 }

 STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
@ -565,4 +741,5 @@ STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeExplicitAutograd, m) {
  m.impl("test_stream", &boxed_test_stream);
  m.impl("test_get_current_device_index", &boxed_test_get_current_device_index);
 }
+
 #endif // LAE_USE_CUDA
--- a/test/cpp_extensions/libtorch_agnostic_extension/libtorch_agnostic/ops.py
+++ b/test/cpp_extensions/libtorch_agnostic_extension/libtorch_agnostic/ops.py
@ -333,3 +333,123 @@ def my_new_zeros_dtype_variant(t) -> Tensor:
    Returns: New zeros tensor
    """
    return torch.ops.libtorch_agnostic.my_new_zeros_dtype_variant.default(t)
+
+
+def my__foreach_mul_(tensors, others) -> ():
+    """
+    Updates tensors to be the result of pointwise multiplying with others.
+
+    Args:
+        tensors: list of tensors
+        others: list of tensors (with the same corresponding shapes as tensors)
+
+    Returns: nothing, tensors is updated in place.
+    """
+    torch.ops.libtorch_agnostic.my__foreach_mul_.default(tensors, others)
+
+
+def my__foreach_mul(tensors, others) -> list[Tensor]:
+    """
+    Returns a list of tensors that are the results of pointwise multiplying
+    tensors and others.
+
+    Args:
+        tensors: list of tensors
+        others: list of tensors (with the same corresponding shapes as tensors)
+
+    Returns: list of multiplied tensors
+    """
+    return torch.ops.libtorch_agnostic.my__foreach_mul.default(tensors, others)
+
+
+def make_tensor_clones_and_call_foreach(t1, t2) -> list[Tensor]:
+    """
+    Returns a list of 2 tensors corresponding to the square of the inputs.
+
+    Args:
+        t1: Tensor
+        t2: Tensor
+
+    Returns: list of [t1^2, t2^2]
+    """
+    return torch.ops.libtorch_agnostic.make_tensor_clones_and_call_foreach.default(
+        t1, t2
+    )
+
+
+def test_device_constructor(is_cuda, index, use_str):
+    """
+    Tests creating a Device from DeviceType and index, or from a string.
+
+    Args:
+        is_cuda: bool - if True, creates CUDA device; if False, creates CPU device
+        index: int - device index
+        use_str: bool - if True, constructs from string; if False, constructs from DeviceType
+
+    Returns: Device - A device with the specified type and index
+    """
+    return torch.ops.libtorch_agnostic.test_device_constructor.default(
+        is_cuda, index, use_str
+    )
+
+
+def test_device_equality(d1, d2) -> bool:
+    """
+    Tests Device equality operator.
+
+    Args:
+        d1: Device - first device
+        d2: Device - second device
+
+    Returns: bool - True if devices are equal
+    """
+    return torch.ops.libtorch_agnostic.test_device_equality.default(d1, d2)
+
+
+def test_device_set_index(device, index):
+    """
+    Tests Device set_index() method.
+
+    Args:
+        device: Device - device to modify
+        index: int - new device index
+
+    Returns: Device - device with updated index
+    """
+    return torch.ops.libtorch_agnostic.test_device_set_index.default(device, index)
+
+
+def test_device_index(device) -> int:
+    """
+    Tests Device index() method.
+
+    Args:
+        device: Device - device to query
+
+    Returns: int - device index
+    """
+    return torch.ops.libtorch_agnostic.test_device_index.default(device)
+
+
+def test_device_is_cuda(device) -> bool:
+    """
+    Tests Device is_cuda() method.
+
+    Args:
+        device: Device - device to check
+
+    Returns: bool - True if device is CUDA
+    """
+    return torch.ops.libtorch_agnostic.test_device_is_cuda.default(device)
+
+
+def test_device_is_cpu(device) -> bool:
+    """
+    Tests Device is_cpu() method.
+
+    Args:
+        device: Device - device to check
+
+    Returns: bool - True if device is CPU
+    """
+    return torch.ops.libtorch_agnostic.test_device_is_cpu.default(device)
--- a/test/cpp_extensions/libtorch_agnostic_extension/test/test_libtorch_agnostic.py
+++ b/test/cpp_extensions/libtorch_agnostic_extension/test/test_libtorch_agnostic.py
@ -367,6 +367,112 @@ if not IS_WINDOWS:
            self.assertNotEqual(result.data_ptr(), expected.data_ptr())
            self.assertEqual(result.stride(), expected.stride())

+        def test_my__foreach_mul_(self, device):
+            import libtorch_agnostic
+
+            N = 5
+            tensors = [torch.rand(32, 16, device=device) for _ in range(N)]
+            tensors_c = [t.clone() for t in tensors]
+            others = [torch.rand(32, 16, device=device) for _ in range(N)]
+
+            libtorch_agnostic.ops.my__foreach_mul_(tensors, others)
+            expected_values = torch._foreach_mul(tensors_c, others)
+
+            for tensor_t, expected_t in zip(tensors, expected_values):
+                self.assertEqual(tensor_t, expected_t)
+
+        def test_my__foreach_mul(self, device):
+            # Compares the out-of-place op against torch._foreach_mul and, for
+            # CUDA inputs, checks that repeated calls leave
+            # torch.cuda.memory_allocated back at its starting value.
+            import libtorch_agnostic
+
+            N = 5
+            tensors = [torch.rand(32, 16, device=device) for _ in range(N)]
+            others = [torch.rand(32, 16, device=device) for _ in range(N)]
+
+            result = libtorch_agnostic.ops.my__foreach_mul(tensors, others)
+            expected = torch._foreach_mul(tensors, others)
+
+            for result_t, expected_t in zip(result, expected):
+                self.assertEqual(result_t, expected_t)
+
+            def _make_cuda_tensors(prior_mem):
+                # While cuda_res is held, allocated memory must exceed the baseline.
+                cuda_res = libtorch_agnostic.ops.my__foreach_mul(tensors, others)
+                self.assertGreater(torch.cuda.memory_allocated(device), prior_mem)
+
+                expected = torch._foreach_mul(tensors, others)
+                for result_t, expected_t in zip(cuda_res, expected):
+                    self.assertEqual(result_t, expected_t)
+
+            if tensors[0].is_cuda:
+                # Baseline is taken with `result` and `expected` above still alive.
+                init_mem = torch.cuda.memory_allocated(device)
+                for _ in range(3):
+                    _make_cuda_tensors(init_mem)
+                    # The helper has returned, so its locals are out of scope;
+                    # the allocation count must be back at the baseline.
+                    curr_mem = torch.cuda.memory_allocated(device)
+                    self.assertEqual(curr_mem, init_mem)
+
+        def test_make_tensor_clones_and_call_foreach(self, device):
+            import libtorch_agnostic
+
+            t1 = torch.rand(2, 5, device=device)
+            t2 = torch.rand(3, 4, device=device)
+            result = libtorch_agnostic.ops.make_tensor_clones_and_call_foreach(t1, t2)
+            self.assertEqual(result[0], t1 * t1)
+            self.assertEqual(result[1], t2 * t2)
+
+        @onlyCUDA
+        def test_device(self, device):
+            # Round-trips torch.device values through the libtorch_agnostic
+            # Device test ops and compares the results with torch.device.
+            import libtorch_agnostic
+
+            # Construct cuda:1 via the (DeviceType, index) path, then the string path.
+            cuda_device = libtorch_agnostic.ops.test_device_constructor(
+                is_cuda=True, index=1, use_str=False
+            )
+            self.assertEqual(cuda_device, torch.device("cuda:1"))
+            cuda_device = libtorch_agnostic.ops.test_device_constructor(
+                is_cuda=True, index=1, use_str=True
+            )
+            self.assertEqual(cuda_device, torch.device("cuda:1"))
+
+            # Accessors and equality on the CUDA device.
+            self.assertEqual(libtorch_agnostic.ops.test_device_index(cuda_device), 1)
+            self.assertTrue(
+                libtorch_agnostic.ops.test_device_equality(
+                    cuda_device, torch.device("cuda:1")
+                )
+            )
+            self.assertFalse(
+                libtorch_agnostic.ops.test_device_equality(
+                    cuda_device, torch.device("cuda:0")
+                )
+            )
+            self.assertFalse(libtorch_agnostic.ops.test_device_is_cpu(cuda_device))
+            self.assertTrue(libtorch_agnostic.ops.test_device_is_cuda(cuda_device))
+
+            # set_index: the returned device must carry the new index.
+            cuda_0_device = libtorch_agnostic.ops.test_device_set_index(cuda_device, 0)
+            self.assertEqual(cuda_0_device, torch.device("cuda:0"))
+
+            # CPU device built via the DeviceType path (positional arguments).
+            cpu_device = libtorch_agnostic.ops.test_device_constructor(False, 0, False)
+            self.assertEqual(cpu_device, torch.device("cpu"))
+            self.assertTrue(
+                libtorch_agnostic.ops.test_device_equality(
+                    cpu_device, torch.device("cpu")
+                )
+            )
+            self.assertTrue(libtorch_agnostic.ops.test_device_is_cpu(cpu_device))
+            self.assertFalse(libtorch_agnostic.ops.test_device_is_cuda(cpu_device))
+            self.assertFalse(
+                libtorch_agnostic.ops.test_device_equality(cpu_device, cuda_device)
+            )
+
+            # An index of 129 must be rejected by both the constructor and set_index.
+            with self.assertRaisesRegex(
+                RuntimeError, "Device index 129 is out of range for int8_t"
+            ):
+                libtorch_agnostic.ops.test_device_constructor(
+                    is_cuda=True, index=129, use_str=False
+                )
+
+            with self.assertRaisesRegex(
+                RuntimeError, "Device index 129 is out of range for int8_t"
+            ):
+                libtorch_agnostic.ops.test_device_set_index(cuda_device, 129)
+
    instantiate_device_type_tests(TestLibtorchAgnostic, globals(), except_for=None)

 if __name__ == "__main__":
--- a/test/custom_backend/test_custom_backend.py
+++ b/test/custom_backend/test_custom_backend.py
@ -1,6 +1,5 @@
 # Owner(s): ["module: unknown"]

-import os
 import tempfile

 from backend import get_custom_backend_library_path, Model, to_custom_backend
@ -41,14 +40,11 @@ class TestCustomBackend(TestCase):
        self.test_execute()

        # Save and load.
-        f = tempfile.NamedTemporaryFile(delete=False)
-        try:
+        with tempfile.NamedTemporaryFile() as f:
            f.close()
            torch.jit.save(self.model, f.name)
            loaded = torch.jit.load(f.name)
-        finally:
-            os.unlink(f.name)
-        self.model = loaded
+            self.model = loaded

        # Test execution again.
        self.test_execute()
--- a/test/custom_operator/test_custom_ops.py
+++ b/test/custom_operator/test_custom_ops.py
@ -1,6 +1,5 @@
 # Owner(s): ["module: unknown"]

-import os.path
 import sys
 import tempfile
 import unittest
@ -144,16 +143,13 @@ def forward(self, arg0_1):
        # Ideally we would like to not have to manually delete the file, but NamedTemporaryFile
        # opens the file, and it cannot be opened multiple times in Windows. To support Windows,
        # close the file after creation and try to remove it manually.
-        file = tempfile.NamedTemporaryFile(delete=False)
-        try:
+        with tempfile.NamedTemporaryFile() as file:
            file.close()
            model.save(file.name)
            loaded = torch.jit.load(file.name)
-        finally:
-            os.unlink(file.name)

-        output = loaded.forward(torch.ones(5))
-        self.assertTrue(output.allclose(torch.ones(5) + 1))
+            output = loaded.forward(torch.ones(5))
+            self.assertTrue(output.allclose(torch.ones(5) + 1))


 if __name__ == "__main__":
--- a/test/distributed/_composable/fsdp/test_fully_shard_logging.py
+++ b/test/distributed/_composable/fsdp/test_fully_shard_logging.py
@ -1,7 +1,7 @@
 # Owner(s): ["module: fsdp"]
 import functools
 import os
-import unittest.mock
+import unittest

 import torch.distributed as dist
 from torch._dynamo.test_case import run_tests
@ -37,9 +37,9 @@ import torch
 import torch.distributed as dist
 import torch.nn as nn
 from torch.distributed.fsdp import fully_shard
-logger = logging.getLogger("torch.distributed._composable.fsdp")
+logger = logging.getLogger("torch.distributed.fsdp.fully_shard")
 logger.setLevel(logging.DEBUG)
-device = {device_type.type}
+device = '{device_type.type}'
 torch.manual_seed(0)
 model = nn.Sequential(*[nn.Linear(4, 4, device=device, bias=False) for _ in range(2)])
 for layer in model:
--- a/test/distributed/_composable/test_replicate_with_fsdp.py
+++ b/test/distributed/_composable/test_replicate_with_fsdp.py
@ -76,7 +76,7 @@ class ReplicateTest(MultiProcessTestCase):
            store=dist.FileStore(self.file_name, self.world_size),
        )

-    @skip_if_lt_x_gpu(2)
+    @skip_if_lt_x_gpu(4)
    def test_replicate_transformer(self):
        """
        This tests that replicate works on a transformer model with fully_shard and replicate layers
@ -126,7 +126,7 @@ class ReplicateTest(MultiProcessTestCase):
                for parameter in layer.parameters():
                    self.assertEqual(parameter.placements, (Shard(dim=0),))

-    @skip_if_lt_x_gpu(2)
+    @skip_if_lt_x_gpu(4)
    def test_replicate_transformer_managed_modules(self):
        """
        This tests that replicate managed modules works properly. In this test we use a Transformer Module with 3 layers,
@ -178,7 +178,7 @@ class ReplicateTest(MultiProcessTestCase):
        replicate_model = replicate(replicate_model)
        self.assertEqual(len(_get_managed_modules((replicate_model,))), 21)

-    @skip_if_lt_x_gpu(2)
+    @skip_if_lt_x_gpu(4)
    def test_replicate_tp_device_mesh(self):
        """
        This tests that a user can pass in a device mesh to replicate a module
@ -206,7 +206,7 @@ class ReplicateTest(MultiProcessTestCase):
                self.assertEqual(parameter.device_mesh.shape, (2,))
                self.assertEqual(parameter.placements, (Replicate(),))

-    @skip_if_lt_x_gpu(2)
+    @skip_if_lt_x_gpu(4)
    def test_train_replicate_fsdp(self):
        """
        Tests that replicate_model has the same behavior as original model when training
@ -253,7 +253,7 @@ class ReplicateTest(MultiProcessTestCase):
            self.assertEqual(replicate_loss, loss)
            check_sharded_parity(self, model, replicate_model)

-    @skip_if_lt_x_gpu(2)
+    @skip_if_lt_x_gpu(4)
    def test_train_parity_2d_mlp(self):
        """
        Verifies when a device mesh is passed in, the model has the same behavior as the original model when training
--- a/test/distributed/_tools/test_sac_ilp.py
+++ b/test/distributed/_tools/test_sac_ilp.py
@ -80,7 +80,7 @@ class TestSACILP(TestCase):
            # postprocessing due to the fact that for ModTracker, the post backward hook
            # is not being called for modules whose inputs don't require gradients
            # TODO: fix this in ModTracker and ensure it does not lead to any perf regression
-            if _ModState.POST_BW not in mod_stats.snapshots.keys():
+            if _ModState.POST_BW not in mod_stats.snapshots:
                mod_stats.snapshots.setdefault(_ModState.POST_BW, []).append(
                    copy.deepcopy(last_snapshot)
                )
--- a/test/distributed/argparse_util_test.py
+++ b/test/distributed/argparse_util_test.py
@ -16,7 +16,7 @@ from torch.distributed.argparse_util import check_env, env
 class ArgParseUtilTest(unittest.TestCase):
    def setUp(self):
        # remove any lingering environment variables
-        for e in os.environ.keys():
+        for e in os.environ.keys():  # noqa: SIM118
            if e.startswith("PET_"):
                del os.environ[e]

--- a/test/distributed/checkpoint/_experimental/test_staging.py
+++ b/test/distributed/checkpoint/_experimental/test_staging.py
@ -207,7 +207,7 @@ class TestDefaultStager(TestCase):
        for i, result in enumerate(staged_results):
            self.assertIsInstance(result, dict)
            # Verify the result contains the expected keys
-            for key in state_dicts[i].keys():
+            for key in state_dicts[i]:
                self.assertIn(key, result)

        stager.close()
--- a/test/distributed/checkpoint/test_dtensor_resharding.py
+++ b/test/distributed/checkpoint/test_dtensor_resharding.py
@ -299,7 +299,7 @@ class TestDTensorReshardMeshChange(DTensorTestBase):

    @with_comms
    @with_temp_dir
-    @skip_if_lt_x_gpu(2)
+    @skip_if_lt_x_gpu(4)
    def test_dtensor_checkpoint_with_uneven_shards(self) -> None:
        """
        Saving a dtensor with uneven shards.
@ -436,6 +436,7 @@ class TestCheckpointableReshard(DTensorTestBase):

    @with_comms
    @with_temp_dir
+    @skip_if_lt_x_gpu(4)
    def test_uneven_reshard_with_checkpointable_api(self) -> None:
        """
        Saves a 1d distributed tensor that has shards with uneven sizes using Checkpointable API.
@ -498,6 +499,7 @@ class TestCheckpointableReshard(DTensorTestBase):

    @with_comms
    @with_temp_dir
+    @skip_if_lt_x_gpu(4)
    def test_uneven_reshard_with_dtensor_shards_wrapper_api(self) -> None:
        """
        Saves a 1d distributed tensor that has shards with uneven sizes using Checkpointable API.
--- a/test/distributed/checkpoint/test_hf_safetensor_e2e.py
+++ b/test/distributed/checkpoint/test_hf_safetensor_e2e.py
@ -60,7 +60,7 @@ class TestSingleRankSaveLoad(TestCase):
        self.assertEqual(
            sorted(state_dict_to_save.keys()), sorted(state_dict_loaded.keys())
        )
-        for key in state_dict_to_save.keys():
+        for key in state_dict_to_save:
            self.assertTrue(
                torch.equal(state_dict_to_save[key], state_dict_loaded[key])
            )
@ -89,7 +89,7 @@ class TestSingleRankSaveLoad(TestCase):
        self.assertEqual(
            sorted(state_dict_to_save.keys()), sorted(state_dict_to_load.keys())
        )
-        for key in state_dict_to_save.keys():
+        for key in state_dict_to_save:
            self.assertTrue(
                torch.equal(state_dict_to_save[key], state_dict_to_load[key])
            )
@ -116,7 +116,7 @@ class TestSingleRankSaveLoad(TestCase):
        self.assertEqual(
            sorted(state_dict_to_save.keys()), sorted(state_dict_loaded.keys())
        )
-        for key in state_dict_to_save.keys():
+        for key in state_dict_to_save:
            self.assertTrue(
                torch.equal(state_dict_to_save[key], state_dict_loaded[key])
            )
@ -156,7 +156,7 @@ class TestSingleRankSaveLoad(TestCase):
        self.assertEqual(
            sorted(state_dict_to_save.keys()), sorted(state_dict_to_load.keys())
        )
-        for key in state_dict_to_save.keys():
+        for key in state_dict_to_save:
            self.assertTrue(
                torch.equal(state_dict_to_save[key], state_dict_to_load[key])
            )
--- a/test/distributed/checkpoint/test_planner.py
+++ b/test/distributed/checkpoint/test_planner.py
@ -18,6 +18,7 @@ from torch.distributed.checkpoint._dedup_save_plans import dedup_save_plans
 from torch.distributed.checkpoint.api import CheckpointException
 from torch.distributed.checkpoint.default_planner import (
    _create_default_local_metadata,
+    _validate_global_plan,
    create_default_global_save_plan,
    create_default_local_load_plan,
    create_default_local_save_plan,
@ -28,6 +29,7 @@ from torch.distributed.checkpoint.filesystem import CURRENT_DCP_VERSION
 from torch.distributed.checkpoint.metadata import (
    BytesStorageMetadata,
    ChunkStorageMetadata,
+    Metadata,
    MetadataIndex,
    TensorProperties,
    TensorStorageMetadata,
@ -560,6 +562,32 @@ class TestPlannerHelpers(TestCase):
        self.assertTrue(_compare_save_plans(plan2, plan2))


+class TestValidateGlobalPlan(TestCase):
+    """Exercises ``_validate_global_plan`` on chunk layouts of a single tensor."""
+
+    def _make_metadata(self, chunks, size):
+        """Wrap ``chunks`` in a ``Metadata`` with one float32 entry named "param"."""
+        return Metadata(
+            state_dict_metadata={
+                "param": TensorStorageMetadata(
+                    properties=TensorProperties(dtype=torch.float32),
+                    size=torch.Size(size),
+                    chunks=chunks,
+                )
+            }
+        )
+
+    def test_non_overlapping_chunks(self):
+        """Four unit-sized chunks at offsets 0..3 of a length-4 tensor validate."""
+        chunks = []
+        for offset in range(4):
+            chunks.append(
+                ChunkStorageMetadata(
+                    offsets=torch.Size([offset]), sizes=torch.Size([1])
+                )
+            )
+        is_valid = _validate_global_plan(
+            [SavePlan([])], self._make_metadata(chunks, [4])
+        )
+        self.assertTrue(is_valid)
+
+    def test_detect_overlapping_chunks(self):
+        """Chunks covering [0, 2) and [1, 3) share index 1 and must be rejected."""
+        first = ChunkStorageMetadata(offsets=torch.Size([0]), sizes=torch.Size([2]))
+        second = ChunkStorageMetadata(offsets=torch.Size([1]), sizes=torch.Size([2]))
+        is_valid = _validate_global_plan(
+            [SavePlan([])], self._make_metadata([first, second], [4])
+        )
+        self.assertFalse(is_valid)
+
+
 class TestLoadPlanner(TestCase):
    @with_temp_dir
    def test_strict(self):
--- a/test/distributed/checkpoint/test_state_dict.py
+++ b/test/distributed/checkpoint/test_state_dict.py
@ -769,7 +769,7 @@ class TestStateDict(DTensorTestBase, VerifyStateDictMixin):
        model_state_dict3 = copy.deepcopy(model_state_dict3)
        self.assertEqual(len(model_state_dict2), 2)
        self.assertEqual(len(model_state_dict3), 2)
-        for key in model_state_dict3.keys():
+        for key in model_state_dict3:
            full_fqn = f"l.{key}"
            value1 = model_state_dict1[full_fqn]
            value2 = model_state_dict2[full_fqn]
@ -886,7 +886,7 @@ class TestStateDict(DTensorTestBase, VerifyStateDictMixin):
            self.assertEqual(cpu_model_value, meta_model_value)

    @with_comms
-    @skip_if_lt_x_gpu(2)
+    @skip_if_lt_x_gpu(4)
    def test_setting_meta_device_model_broadcasting_and_memory(self) -> None:
        # This test verifies that we can set model state dict by a meta device model
        # With the correlated changes in state_dict, meta device model should be accepted
--- a/test/distributed/fsdp/test_fsdp_state_dict.py
+++ b/test/distributed/fsdp/test_fsdp_state_dict.py
@ -587,9 +587,7 @@ class TestFSDPStateDict(FSDPTest):
                    model, cpu_offload.offload_params, fp16
                )

-            ignore_keys = [
-                k for k in fsdp_state_dict.keys() if NON_ROOT_FSDP_PREFIX in k
-            ]
+            ignore_keys = [k for k in fsdp_state_dict if NON_ROOT_FSDP_PREFIX in k]

            self._validate_state_dict_contents(
                model,
@ -910,7 +908,7 @@ class TestFSDPStateDict(FSDPTest):
        with sd_mgr:
            fsdp_state_dict = model.state_dict()

-        ignore_keys = [k for k in fsdp_state_dict.keys() if NON_ROOT_FSDP_PREFIX in k]
+        ignore_keys = [k for k in fsdp_state_dict if NON_ROOT_FSDP_PREFIX in k]
        self._validate_state_dict_contents(
            model,
            fsdp_state_dict,
@ -959,9 +957,7 @@ class TestFSDPStateDict(FSDPTest):
                # Full name of linear_skip param tensors in SkipModel, as would be
                # stored in checkpoint.
                linear_skip_tensor_names = [
-                    k
-                    for k in dict(module.named_parameters()).keys()
-                    if LINEAR_SKIP in k
+                    k for k in dict(module.named_parameters()) if LINEAR_SKIP in k
                ]
                # skip SkipModule
                linear_skip = getattr(module, LINEAR_SKIP)
--- a/test/distributed/launcher/api_test.py
+++ b/test/distributed/launcher/api_test.py
@ -137,7 +137,7 @@ class ElasticLaunchTest(unittest.TestCase):
        self.test_dir = tempfile.mkdtemp()

        # remove any lingering environment variables.
-        for env in os.environ.keys():
+        for env in os.environ.keys():  # noqa: SIM118
            if env.startswith("PET_"):
                del os.environ[env]

--- a/test/distributed/launcher/script_deviceid.py
+++ b/test/distributed/launcher/script_deviceid.py
@ -0,0 +1,44 @@
+# Owner(s): ["oncall: r2p"]
+
+# This is a helper script for
+# test_run.py::ElasticLaunchTest::test_virtual_local_rank. It prints out the
+# generated inductor output for a simple function.
+
+import os
+from unittest.mock import patch
+
+import torch
+import torch.distributed as dist
+from torch._inductor import codecache
+
+
+@torch.compile
+def myfn(x: torch.Tensor) -> torch.Tensor:
+    """Return ``x + x``; wrapped in ``torch.compile`` so calling it generates code."""
+    return x + x
+
+
+dist.init_process_group(backend="nccl")
+
+# LOCAL_RANK is parsed as an integer device index, so the fallback used when
+# the variable is unset must itself be an integer string; a value such as
+# "cuda:0" would make int() raise ValueError.
+local_rank = int(os.environ.get("LOCAL_RANK", "0"))
+torch.cuda.set_device(local_rank)
+
+
+def print_output_code(original_fn):
+    """Build a logging-style callable that prints the payload of "Output code:" records.
+
+    ``original_fn`` is accepted but never called: the returned callable does not
+    forward anything to it.
+    """
+
+    def wrapper(msg, *args, **kwargs):
+        # Ignore records without format arguments or with a different message.
+        if not args or "Output code:" not in msg:
+            return
+        # The first format argument is what gets printed.
+        print(args[0])
+
+    return wrapper
+
+
+# Input tensor for the compiled function, allocated on a CUDA device.
+x = torch.rand(2, 2, device="cuda")
+
+# Swap the logger's ``debug`` method for a mock whose side effect prints the
+# first format argument of any "Output code:" message. The real ``debug`` is
+# passed to print_output_code but is not invoked by the wrapper it returns.
+with patch.object(
+    codecache.output_code_log,
+    "debug",
+    side_effect=print_output_code(codecache.output_code_log.debug),
+):
+    y = myfn(x)
+
+dist.destroy_process_group()
--- a/test/distributed/launcher/test_run.py
+++ b/test/distributed/launcher/test_run.py
@ -16,7 +16,7 @@ import sys
 import tempfile
 import uuid
 from contextlib import closing, redirect_stderr, redirect_stdout
-from unittest import mock
+from unittest import mock, skipIf
 from unittest.mock import MagicMock, Mock, patch

 import torch.distributed.run as launch
@ -28,6 +28,7 @@ from torch.distributed.elastic.utils.distributed import get_free_port
 from torch.testing._internal.common_utils import (
    run_tests,
    skip_but_pass_in_sandcastle_if,
+    TEST_CUDA,
    TEST_WITH_DEV_DBG_ASAN,
    TestCase,
 )
@ -69,7 +70,7 @@ class ElasticLaunchTest(TestCase):
        self.test_dir = tempfile.mkdtemp()

        # remove any lingering environment variables
-        for env in os.environ.keys():
+        for env in os.environ.keys():  # noqa: SIM118
            if env.startswith("PET_"):
                del os.environ[env]

@ -677,6 +678,96 @@ class ElasticLaunchTest(TestCase):
        for i in range(nproc_per_node):
            self.assertTrue(f"[rank{i}]: creating " in captured_out.getvalue())

+    @skip_but_pass_in_sandcastle_if(
+        TEST_WITH_DEV_DBG_ASAN, "test incompatible with dev/dbg asan"
+    )
+    @skipIf(not TEST_CUDA, "requires CUDA")
+    def test_virtual_local_rank(self):
+        """
+        Test that virtual-local-rank ensures consistent device IDs across ranks.
+        Without it, ranks may compile to different devices, leading to different code.
+
+        Launches ``script_deviceid.py`` on two local ranks, once without and once
+        with ``--virtual-local-rank``, and compares the "cuda:"-mentioning output
+        lines captured from each rank.
+        """
+        # Local import: ``torch`` itself may not be bound at module level here.
+        import torch
+
+        run_id = str(uuid.uuid4().int)
+        nnodes = 1
+        nproc_per_node = 2
+
+        # The helper script selects the CUDA device matching its LOCAL_RANK, so
+        # the run without --virtual-local-rank needs one device per rank.
+        if torch.cuda.device_count() < nproc_per_node:
+            self.skipTest(f"requires at least {nproc_per_node} CUDA devices")
+
+        # Helper function to run and capture output
+        def run_test(use_virtual_local_rank):
+            args = [
+                f"--nnodes={nnodes}",
+                f"--nproc-per-node={nproc_per_node}",
+                f"--rdzv-id={run_id}",
+                "--monitor-interval=1",
+                "--start-method=spawn",
+                "--redirect=3",
+                "--tee=3",
+            ]
+            if use_virtual_local_rank:
+                args.append("--virtual-local-rank")
+
+            args.append(path("script_deviceid.py"))
+
+            captured_out = io.StringIO()
+            captured_err = io.StringIO()
+            with redirect_stdout(captured_out), redirect_stderr(captured_err):
+                launch.main(args)
+
+            return captured_out.getvalue()
+
+        def split_ranks(output):
+            # Keep only lines mentioning a CUDA device, grouped by their
+            # "[default<N>]:" prefix, with that prefix stripped.
+            prefixes = ("[default0]:", "[default1]:")
+            per_rank = ([], [])
+            for line in output.splitlines():
+                if "cuda:" not in line:
+                    continue
+                for prefix, lines in zip(prefixes, per_rank):
+                    if line.startswith(prefix):
+                        lines.append(line[len(prefix) :])
+                        break
+            return per_rank
+
+        # First, run WITHOUT virtual-local-rank - outputs should differ
+        output = run_test(use_virtual_local_rank=False)
+        rank0, rank1 = split_ranks(output)
+
+        # Verify we actually captured compiled code from both ranks
+        self.assertGreater(
+            len(rank0), 0, "Expected to capture compiled code from rank 0"
+        )
+        self.assertGreater(
+            len(rank1), 0, "Expected to capture compiled code from rank 1"
+        )
+
+        # Without virtual-local-rank, the ranks should have DIFFERENT compiled code
+        # because they see different device IDs (cuda:0 vs cuda:1)
+        self.assertNotEqual(
+            rank0,
+            rank1,
+            "Expected different compiled code without --virtual-local-rank",
+        )
+
+        # Now run WITH virtual-local-rank - outputs should be identical
+        output = run_test(use_virtual_local_rank=True)
+        rank0, rank1 = split_ranks(output)
+
+        # Verify we actually captured compiled code from both ranks
+        self.assertGreater(
+            len(rank0),
+            0,
+            "Expected to capture compiled code from rank 0 with --virtual-local-rank",
+        )
+        self.assertGreater(
+            len(rank1),
+            0,
+            "Expected to capture compiled code from rank 1 with --virtual-local-rank",
+        )
+
+        # With virtual-local-rank, both ranks should have IDENTICAL compiled code
+        # because they both see cuda:0 during compilation
+        self.assertEqual(
+            rank0, rank1, "Expected identical compiled code with --virtual-local-rank"
+        )
+

 if __name__ == "__main__":
    run_tests()
--- a/test/distributed/pipelining/test_schedule_multiproc.py
+++ b/test/distributed/pipelining/test_schedule_multiproc.py
@ -39,6 +39,7 @@ from torch.nn.modules.loss import MSELoss
 from torch.testing._internal.common_distributed import (
    MultiProcContinuousTest,
    requires_accelerator_dist_backend,
+    skip_if_lt_x_gpu,
 )
 from torch.testing._internal.common_utils import (
    check_leaked_tensors,
@ -231,6 +232,7 @@ class ScheduleTest(MultiProcContinuousTest):
        not TEST_MULTIACCELERATOR, f"{backend} test requires 2+ GPUs"
    )
    @parametrize("ScheduleClass", [_ScheduleForwardOnly])
+    @skip_if_lt_x_gpu(4)
    def test_forward_only(self, ScheduleClass):
        mod, mod_ref, x, _, _ = setup_models_and_data(self.config)
        x_clone = x.clone()
@ -274,6 +276,7 @@ class ScheduleTest(MultiProcContinuousTest):
            ScheduleInterleavedZeroBubble,
        ],
    )
+    @skip_if_lt_x_gpu(4)
    def test_eval_inference_mode(self, ScheduleClass):
        num_microbatches = 4
        if ScheduleClass in [
@ -351,6 +354,7 @@ class ScheduleTest(MultiProcContinuousTest):
            ScheduleInterleavedZeroBubble,
        ],
    )
+    @skip_if_lt_x_gpu(4)
    def test_return_output(self, ScheduleClass):
        num_microbatches = 4
        if ScheduleClass in [
@ -406,6 +410,7 @@ class ScheduleTest(MultiProcContinuousTest):
        not TEST_MULTIACCELERATOR, f"{backend} test requires 2+ GPUs"
    )
    @parametrize("ScheduleClass", [ScheduleGPipe, Schedule1F1B])
+    @skip_if_lt_x_gpu(4)
    def test_multi_iter(self, ScheduleClass):
        mod, _, x, target, loss_fn = setup_models_and_data(self.config)
        chunks = 4
@ -429,6 +434,7 @@ class ScheduleTest(MultiProcContinuousTest):
        not TEST_MULTIACCELERATOR, f"{backend} test requires 2+ GPUs"
    )
    @parametrize("ScheduleClass", [ScheduleGPipe, Schedule1F1B])
+    @skip_if_lt_x_gpu(4)
    def test_kwargs_with_tracer(self, ScheduleClass):
        mod = ModelWithKwargs(d_hid, splits=self.world_size)
        mod.to(self.device)
@ -481,6 +487,7 @@ class ScheduleTest(MultiProcContinuousTest):
        not TEST_MULTIACCELERATOR, f"{backend} test requires 2+ GPUs"
    )
    @parametrize("ScheduleClass", [ScheduleGPipe, Schedule1F1B])
+    @skip_if_lt_x_gpu(4)
    def test_grad_with_tracer(self, ScheduleClass):
        mod, ref_mod, x, target, loss_fn = setup_models_and_data(self.config)

@ -523,6 +530,7 @@ class ScheduleTest(MultiProcContinuousTest):
    )
    @parametrize("ScheduleClass", [ScheduleGPipe, Schedule1F1B])
    @parametrize("shape_inference", [True, False])
+    @skip_if_lt_x_gpu(4)
    def test_grad_with_manual(self, ScheduleClass, shape_inference):
        mod, ref_mod, x, target, loss_fn = setup_models_and_data(self.config)

@ -586,6 +594,7 @@ class ScheduleTest(MultiProcContinuousTest):
            ScheduleInterleavedZeroBubble,
        ],
    )
+    @skip_if_lt_x_gpu(4)
    def test_grad_with_manual_interleaved(self, ScheduleClass):
        stages_per_rank = 2
        n_stages = stages_per_rank * self.world_size
@ -650,6 +659,7 @@ class ScheduleTest(MultiProcContinuousTest):
        not TEST_MULTIACCELERATOR, f"{backend} test requires 2+ GPUs"
    )
    @parametrize("ScheduleClass", [ScheduleInterleavedZeroBubble])
+    @skip_if_lt_x_gpu(4)
    def test_schedule_with_weight_update_mlp_e2e(self, ScheduleClass):
        stages_per_rank = 2
        n_stages = stages_per_rank * self.world_size
@ -736,6 +746,7 @@ class ScheduleTest(MultiProcContinuousTest):
        "schedule_class",
        [ScheduleZBVZeroBubble, ScheduleDualPipeV],
    )
+    @skip_if_lt_x_gpu(4)
    def test_v_shape_schedules(self, schedule_class):
        n_stages = 8
        rank_stages = {0: [0, 7], 1: [1, 6], 2: [2, 5], 3: [3, 4]}
@ -780,6 +791,7 @@ class ScheduleTest(MultiProcContinuousTest):
    @skip_but_pass_in_sandcastle_if(
        not TEST_MULTIACCELERATOR, f"{backend} test requires 2+ GPUs"
    )
+    @skip_if_lt_x_gpu(4)
    def test_custom_function_callback(self):
        """Test the custom function callback functionality with _PipelineScheduleRuntime."""
        n_stages = 8
@ -979,6 +991,7 @@ class ScheduleTest(MultiProcContinuousTest):
        "ScheduleClass",
        [ScheduleInterleavedZeroBubble, ScheduleInterleaved1F1B],
    )
+    @skip_if_lt_x_gpu(4)
    def test_zero_bubble_with_model_kwargs(self, ScheduleClass):
        stages_per_rank = 2
        n_stages = stages_per_rank * self.world_size
@ -1072,6 +1085,7 @@ class CustomSchedulesTest(MultiProcContinuousTest):
        "schedule_class",
        [ScheduleVShaped, ScheduleUnbalanced],
    )
+    @skip_if_lt_x_gpu(4)
    def test_non_symmetric_stage_ids(self, schedule_class):
        n_stages = schedule_class.n_stages
        rank_stages = schedule_class.rank_stages
@ -1121,6 +1135,7 @@ class CustomSchedulesTest(MultiProcContinuousTest):
        not TEST_MULTIACCELERATOR, f"{backend} test requires 2+ GPUs"
    )
    @parametrize("ScheduleClass", [ScheduleWithReorderedB])
+    @skip_if_lt_x_gpu(4)
    def test_pipeline_schedule_runtime_custom_sched(self, ScheduleClass):
        n_stages = 2
        stages_per_rank = 1
@ -1181,6 +1196,7 @@ class CustomSchedulesTest(MultiProcContinuousTest):
        not TEST_MULTIACCELERATOR, f"{backend} test requires 2+ GPUs"
    )
    @parametrize("ScheduleClass", [ScheduleWithW])
+    @skip_if_lt_x_gpu(4)
    def test_schedule_with_native_zero_bubble(self, ScheduleClass):
        n_stages = ScheduleClass.n_stages
        num_microbatches = ScheduleClass.num_microbatches
--- a/test/distributed/tensor/test_convolution_ops.py
+++ b/test/distributed/tensor/test_convolution_ops.py
@ -204,14 +204,16 @@ class DistConvolutionOpsTest(DTensorTestBase):
        self.assertTrue(b_dt.grad is not None)
        self.assertTrue(x_dt.grad is None)

-    def _run_single_arg_fwd(self, model, arg) -> tuple[torch.Tensor, torch.Tensor]:
+    def _run_single_arg_fwd(
+        self, model, arg, placements=None
+    ) -> tuple[torch.Tensor, torch.Tensor]:
        """Given model and arg, runs fwd model local and distbuted given device_mesh"""
        device_mesh = self.build_device_mesh()
        model_copy = copy.deepcopy(model).to(device=self.device_type)
        dist_model = distribute_module(model, device_mesh, _conv_fn)
-        arg_dt = DTensor.from_local(arg, device_mesh, [Replicate()])
+        arg_dt = DTensor.from_local(arg, device_mesh, placements)
        out_dt = dist_model(arg_dt.to(device=self.device_type))
-        out = model_copy(arg)
+        out = model_copy(arg_dt.full_tensor())
        return (out_dt.full_tensor(), out)

    @with_comms
@ -219,22 +221,20 @@ class DistConvolutionOpsTest(DTensorTestBase):
        model = nn.Conv1d(64, 64, 3, padding=1)
        x = torch.randn(1, 64, 8, device=self.device_type)
        out_dt, out = self._run_single_arg_fwd(model, x)
-        self.assertEqual(out_dt.shape, out.shape)
+        self.assertEqual(out_dt, out)

    @with_comms
    def test_conv3d(self):
        model = nn.Conv3d(64, 64, 3, padding=1)
        x = torch.randn(1, 64, 8, 8, 8, device=self.device_type)
-        out_dt, out = self._run_single_arg_fwd(model, x)
-        self.assertEqual(out_dt.shape, out.shape)
+        out_dt, out = self._run_single_arg_fwd(model, x, [Shard(0)])
+        self.assertEqual(out_dt, out)


 DistConvolutionOpsTestWithLocalTensor = create_local_tensor_test_class(
    DistConvolutionOpsTest,
    # Send / recv ops are not supported
    skipped_tests=[
-        "test_conv1d",
-        "test_conv3d",
        "test_conv_backward_none_grad_inp",
        "test_depthwise_convolution",
        "test_downsampling_convolution",
--- a/test/distributed/tensor/test_dtensor_compile.py
+++ b/test/distributed/tensor/test_dtensor_compile.py
@ -464,6 +464,25 @@ def forward(self, b_parametrizations_buffer_original0, x):
        run(g, 64, 8)
        self.assertEqual(cnt.frame_count, 2)

+    def test_dtensor_requires_grad_recompile(self):
+        """Expect a second compile when the input's requires_grad flips."""
+        mesh = DeviceMesh(self.device_type, torch.arange(self.world_size))
+        counter = torch._dynamo.testing.CompileCounterWithBackend("aot_eager")
+
+        @torch.compile(backend=counter, fullgraph=True)
+        def f(x):
+            squared = x * x
+            return squared.to_local()
+
+        # Same shape and sharding on both calls; only requires_grad of the
+        # source tensor differs.
+        for requires_grad in (False, True):
+            full_x = torch.randn(8, 8, requires_grad=requires_grad)
+            f(distribute_tensor(full_x, mesh, [Shard(0)]))
+
+        self.assertEqual(counter.frame_count, 2)
+
    def test_dtensor_attribute_access_on_intermediate(self):
        mesh = DeviceMesh(self.device_type, torch.arange(self.world_size))

--- a/test/distributed/tensor/test_dtensor_export.py
+++ b/test/distributed/tensor/test_dtensor_export.py
@ -535,6 +535,19 @@ class DTensorExportTest(TestCase):

        self.assertEqual(fn(z), gm(z)[0])

+    def test_dtensor_data_dependent_index(self):
+        """Capture a module that indexes a replicated DTensor with a boolean DTensor.
+
+        Nothing is asserted about the captured result; the test only requires
+        the capture call to complete.
+        """
+        device_mesh = init_device_mesh(self.device_type, mesh_shape=(self.world_size,))
+
+        class Foo(torch.nn.Module):
+            def forward(self, x, y):
+                return x[y]
+
+        x = torch.randn(10)
+        # randint with high=1 draws only zeros, so the mask below is all False.
+        y = torch.randint(1, (10,)).bool()
+        x_dt = distribute_tensor(x, device_mesh, placements=[Replicate()])
+        y_dt = distribute_tensor(y, device_mesh, placements=[Replicate()])
+        _dynamo_graph_capture_for_export(Foo())(x_dt, y_dt)
+

 instantiate_parametrized_tests(DTensorExportTest)

--- a/test/distributed/tensor/test_math_ops.py
+++ b/test/distributed/tensor/test_math_ops.py
@ -26,6 +26,7 @@ from torch.distributed.tensor.parallel import (
    RowwiseParallel,
    SequenceParallel,
 )
+from torch.testing._internal.common_distributed import skip_if_lt_x_gpu
 from torch.testing._internal.common_utils import run_tests
 from torch.testing._internal.distributed._tensor.common_dtensor import (
    create_local_tensor_test_class,
@ -764,6 +765,7 @@ class DistMathOpsTest(DTensorTestBase):
        self.assertEqual(grad1_norm.device_mesh, mesh_y)

    @with_comms
+    @skip_if_lt_x_gpu(4)
    def test_foreach_add_different_mesh(self):
        mesh_shape = (2, self.world_size // 2)
        mesh_2d = init_device_mesh(
--- a/test/distributed/tensor/test_op_strategy.py
+++ b/test/distributed/tensor/test_op_strategy.py
@ -577,7 +577,7 @@ class DistTensorReplicateStrategyRegistrationTest(DTensorTestBase):
                self.assertEqual(
                    comm_mode.get_comm_counts(),
                    {
-                        torch.ops.c10d_functional.all_gather_into_tensor: 4,
+                        torch.ops.c10d_functional.all_gather_into_tensor: self.world_size,
                    },
                )
                expected_cost = [
--- a/test/distributed/tensor/test_redistribute.py
+++ b/test/distributed/tensor/test_redistribute.py
@ -2,7 +2,6 @@
 # Owner(s): ["oncall: distributed"]

 import contextlib
-import copy
 import itertools
 import unittest

@ -22,9 +21,8 @@ from torch.distributed.tensor import (
 )
 from torch.distributed.tensor._collective_utils import shard_dim_alltoall
 from torch.distributed.tensor._dtensor_spec import ShardOrderEntry
-from torch.distributed.tensor._redistribute import redistribute_local_tensor
 from torch.distributed.tensor.debug import CommDebugMode
-from torch.distributed.tensor.placement_types import _StridedShard
+from torch.distributed.tensor.placement_types import _StridedShard, MaskPartial
 from torch.testing._internal.common_utils import (
    instantiate_parametrized_tests,
    parametrize,
@ -35,7 +33,11 @@ from torch.testing._internal.common_utils import (
 from torch.testing._internal.distributed._tensor.common_dtensor import (
    create_local_tensor_test_class,
    DTensorTestBase,
+    generate_shard_orders,
+    make_full_tensor,
    map_local_tensor_for_rank,
+    patched_distribute_tensor as _distribute_tensor,
+    redistribute,
    with_comms,
 )
 from torch.utils._debug_mode import DebugMode
@ -785,88 +787,6 @@ class DistributeWithDeviceOrderTest(DTensorTestBase):
        else:
            return ""

-    # TODO(zpcore): remove once the native redistribute supports shard_order arg
-    def redistribute(
-        self,
-        dtensor_input,
-        device_mesh,
-        placements,
-        shard_order,
-        use_graph_based_transform=True,
-    ):
-        """
-        wrapper function to support shard_order for redistribution
-        This is a simpler version of Redistribute, only considers the forward.
-        """
-        if placements is None:
-            placements = self._shard_order_to_placement(shard_order, device_mesh)
-        placements = tuple(placements)
-        old_spec = dtensor_input._spec
-        new_spec = copy.deepcopy(old_spec)
-        new_spec.placements = placements
-        if shard_order is not None:
-            new_spec.shard_order = shard_order
-        else:
-            new_spec.shard_order = ()
-        if old_spec == new_spec:
-            return dtensor_input
-        dtensor_input = DTensor.from_local(
-            redistribute_local_tensor(
-                dtensor_input.to_local(),
-                old_spec,
-                new_spec,
-                use_graph_based_transform=use_graph_based_transform,
-            ),
-            device_mesh,
-        )
-        dtensor_input._spec = copy.deepcopy(new_spec)
-        return dtensor_input  # returns DTensor
-
-    # TODO(zpcore): remove once the native distribute_tensor supports
-    # shard_order arg
-    def distribute_tensor(
-        self,
-        input_tensor,
-        device_mesh,
-        placements,
-        shard_order,
-        use_graph_based_transform=True,
-    ):
-        """wrapper function to support shard_order for tensor distribution"""
-        if placements is None:
-            placements = self._shard_order_to_placement(shard_order, device_mesh)
-        placements = tuple(placements)
-        tensor_dt = distribute_tensor(input_tensor, device_mesh, placements)
-        # fix the shard order
-        return self.redistribute(
-            tensor_dt, device_mesh, placements, shard_order, use_graph_based_transform
-        )
-
-    # TODO(zpcore): remove once the native redistribute supports shard_order arg
-    def full_tensor(self, dtensor_input):
-        """wrapper function to support DTensor.full_tensor"""
-        return self.redistribute(
-            dtensor_input, dtensor_input.device_mesh, placements=None, shard_order=()
-        ).to_local()
-
-    def _shard_order_to_placement(self, shard_order, mesh):
-        """convert shard_order to placement with only Replicate() and Shard()"""
-        placements = [Replicate() for _ in range(mesh.ndim)]
-        if shard_order is not None:
-            for entry in shard_order:
-                tensor_dim = entry.tensor_dim
-                mesh_dims = entry.mesh_dims
-                for mesh_dim in mesh_dims:
-                    placements[mesh_dim] = Shard(tensor_dim)
-        return tuple(placements)
-
-    def _convert_shard_order_dict_to_ShardOrder(self, shard_order):
-        """Convert shard_order dict to ShardOrder"""
-        return tuple(
-            ShardOrderEntry(tensor_dim=tensor_dim, mesh_dims=tuple(mesh_dims))
-            for tensor_dim, mesh_dims in shard_order.items()
-        )
-
    @with_comms
    def test_ordered_redistribute(self):
        """Test ordered redistribution with various sharding syntaxes"""
@ -927,13 +847,11 @@ class DistributeWithDeviceOrderTest(DTensorTestBase):
        for idx, ((src_placement, src_order), (dst_placement, dst_order)) in enumerate(
            sharding_src_dst_pairs_with_expected_trace
        ):
-            sharded_dt = self.distribute_tensor(
+            sharded_dt = _distribute_tensor(
                input_data.clone(), mesh, src_placement, shard_order=src_order
            )
            with DebugMode(record_torchfunction=False) as debug_mode:
-                sharded_dt = self.redistribute(
-                    sharded_dt, mesh, dst_placement, dst_order
-                )
+                sharded_dt = redistribute(sharded_dt, mesh, dst_placement, dst_order)
            trace_str = self._extract_redistribute_trace_from_debug_mode(
                debug_mode.debug_string()
            )
@ -957,49 +875,11 @@ class DistributeWithDeviceOrderTest(DTensorTestBase):
                    trace_str,
                    """S(0)[0]S(0)[1]R->S(0)S(1)R->RS(1)R->RS(1)S(0)""",
                )
-            expected_dt = self.distribute_tensor(
+            expected_dt = _distribute_tensor(
                input_data.clone(), mesh, dst_placement, shard_order=dst_order
            )
            self.assertEqual(sharded_dt.to_local(), expected_dt.to_local())

-    def generate_shard_orders(self, mesh, tensor_rank):
-        # Generate all possible sharding placement of tensor with rank
-        # `tensor_rank` over mesh.
-        def _split_list(lst: list, N: int):
-            def compositions(n, k):
-                if k == 1:
-                    yield [n]
-                else:
-                    for i in range(1, n - k + 2):
-                        for tail in compositions(n - i, k - 1):
-                            yield [i] + tail
-
-            length = len(lst)
-            for comp in compositions(length, N):
-                result = []
-                start = 0
-                for size in comp:
-                    result.append(lst[start : start + size])
-                    start += size
-                yield result
-
-        all_mesh = list(range(mesh.ndim))
-        all_device_order = list(itertools.permutations(all_mesh))
-        for device_order in all_device_order:
-            # split on device orders, and assign each device order segment to a tensor dim
-            for num_split in range(1, mesh.ndim + 1):
-                for splitted_list in _split_list(list(range(mesh.ndim)), num_split):
-                    for tensor_dims in itertools.combinations(
-                        range(tensor_rank), len(splitted_list)
-                    ):
-                        shard_order = {}
-                        assert len(tensor_dims) == len(splitted_list)
-                        for tensor_dim, mesh_dims in zip(tensor_dims, splitted_list):
-                            shard_order[tensor_dim] = device_order[
-                                mesh_dims[0] : mesh_dims[-1] + 1
-                            ]
-                        yield self._convert_shard_order_dict_to_ShardOrder(shard_order)
-
    @with_comms
    def test_generate_shard_orders(self):
        """Check if `generate_shard_orders` generates unique sharding combinations"""
@ -1012,7 +892,7 @@ class DistributeWithDeviceOrderTest(DTensorTestBase):
        ]
        for test_input in test_inputs:
            all_combinations = []
-            for shard_order in self.generate_shard_orders(
+            for shard_order in generate_shard_orders(
                test_input["mesh"], test_input["tensor_rank"]
            ):
                all_combinations.append(shard_order)  # noqa: PERF402
@ -1062,12 +942,12 @@ class DistributeWithDeviceOrderTest(DTensorTestBase):
            input_data = torch.randn(tensor_shape, device=self.device_type)
            tensor_rank = input_data.ndim
            with maybe_disable_local_tensor_mode():
-                shard_orders = self.generate_shard_orders(mesh, tensor_rank)
+                shard_orders = generate_shard_orders(mesh, tensor_rank)
            for shard_order in shard_orders:
-                sharded_dt = self.distribute_tensor(
+                sharded_dt = _distribute_tensor(
                    input_data.clone(), mesh, placements=None, shard_order=shard_order
                )
-                self.assertEqual(self.full_tensor(sharded_dt), input_data)
+                self.assertEqual(make_full_tensor(sharded_dt), input_data)

        # 2. Verify the correctness of redistribution from DTensor to DTensor.
        # This test repeatedly redistributes a DTensor to various ordered
@ -1078,20 +958,20 @@ class DistributeWithDeviceOrderTest(DTensorTestBase):
            tensor_rank = input_data.ndim
            prev_sharded_dt = None
            with maybe_disable_local_tensor_mode():
-                shard_orders = self.generate_shard_orders(mesh, tensor_rank)
+                shard_orders = generate_shard_orders(mesh, tensor_rank)
            for shard_order in shard_orders:
                if prev_sharded_dt is None:
-                    prev_sharded_dt = self.distribute_tensor(
+                    prev_sharded_dt = _distribute_tensor(
                        input_data.clone(),
                        mesh,
                        placements=None,
                        shard_order=shard_order,
                    )
                else:
-                    sharded_dt = self.redistribute(
+                    sharded_dt = redistribute(
                        prev_sharded_dt, mesh, placements=None, shard_order=shard_order
                    )
-                    self.assertEqual(self.full_tensor(sharded_dt), input_data)
+                    self.assertEqual(make_full_tensor(sharded_dt), input_data)
                    prev_sharded_dt = sharded_dt

    @with_comms
@ -1136,13 +1016,13 @@ class DistributeWithDeviceOrderTest(DTensorTestBase):
                local_tensor = torch.randn(shape, device=self.device_type)
                full_tensor = DTensor.from_local(local_tensor, mesh, placements)
                with maybe_disable_local_tensor_mode():
-                    shard_orders = self.generate_shard_orders(mesh, len(shape))
+                    shard_orders = generate_shard_orders(mesh, len(shape))
                for shard_order in shard_orders:
-                    sharded_dt = self.redistribute(
+                    sharded_dt = redistribute(
                        full_tensor, mesh, placements=None, shard_order=shard_order
                    )
                    self.assertEqual(
-                        self.full_tensor(sharded_dt), self.full_tensor(full_tensor)
+                        make_full_tensor(sharded_dt), make_full_tensor(full_tensor)
                    )

    @unittest.skip(
@ -1152,24 +1032,20 @@ class DistributeWithDeviceOrderTest(DTensorTestBase):
    @with_comms
    def test_ordered_redistribute_for_special_placement(self):
        """Test ordered redistribution with special placement"""
-        from torch.distributed.tensor._ops._embedding_ops import _MaskPartial
-
        torch.manual_seed(21)
        mesh = init_device_mesh(self.device_type, (8,))
        input_data = torch.randn((8, 8), device=self.device_type)
        src_placement = [Shard(1)]
        tgt_placement = [
-            (_MaskPartial(offset_shape=torch.Size([10, 20]), offset_dim=0),)
+            (MaskPartial(offset_shape=torch.Size([10, 20]), offset_dim=0),)
        ]
-        sharded_dt = self.distribute_tensor(
+        sharded_dt = _distribute_tensor(
            input_data.clone(),
            mesh,
            src_placement,
            shard_order=(ShardOrderEntry(tensor_dim=1, mesh_dims=(0,)),),
        )
-        sharded_dt = self.redistribute(
-            sharded_dt, mesh, tgt_placement, shard_order=None
-        )
+        sharded_dt = redistribute(sharded_dt, mesh, tgt_placement, shard_order=None)

    @with_comms
    def test_shard_order_same_data_as_strided_shard(self):
@ -1179,7 +1055,7 @@ class DistributeWithDeviceOrderTest(DTensorTestBase):
        strided_placement = [_StridedShard(-2, split_factor=2), Shard(-2)]
        x_strided_dt = distribute_tensor(x, device_mesh, strided_placement)
        # specify right-to-left order use ordered shard
-        x_ordered_dt = self.distribute_tensor(
+        x_ordered_dt = _distribute_tensor(
            x,
            device_mesh,
            placements=[Shard(0), Shard(0)],
--- a/test/distributed/tensor/test_utils.py
+++ b/test/distributed/tensor/test_utils.py
@ -34,6 +34,10 @@ from torch.distributed.tensor.placement_types import (
 from torch.testing._internal.common_utils import run_tests, TestCase
 from torch.testing._internal.distributed._tensor.common_dtensor import (
    DTensorTestBase,
+    generate_shard_orders,
+    LocalDTensorTestBase,
+    patched_distribute_tensor as _distribute_tensor,
+    shard_order_to_placement,
    with_comms,
 )

@ -774,6 +778,63 @@ class TestStridedSharding(DTensorTestBase):
            self.assertEqual(dtensor.full_tensor(), tensor)


+class Test_StridedShard_with_shard_order(LocalDTensorTestBase):
+    @property
+    def world_size(self) -> int:
+        return 32
+
+    @with_comms
+    def test_StridedShard_to_shard_order(self):
+        with LocalTensorMode(ranks=self.world_size):
+            mesh = DeviceMesh("cpu", torch.arange(self.world_size).view(2, 2, 2, 2, 2))
+            shard_iter = generate_shard_orders(mesh, 3)
+            # It takes ~4.8h to complete total 2520 shard order combinations here
+            # using LocalTensor. So we only randomly pick 25 shard orders to test.
+            all_shard_order = list(shard_iter)
+            import random
+
+            random.seed(42)
+            shard_order_choices = random.sample(
+                all_shard_order, min(25, len(all_shard_order))
+            )
+
+            x = torch.randn(32, 32, 32)
+            for shard_order in shard_order_choices:
+                a = _distribute_tensor(x, mesh, None, shard_order)
+
+                placement_without_stridedshard = shard_order_to_placement(
+                    shard_order, mesh
+                )
+                placements_with_stridedshard = (
+                    DTensorSpec._convert_shard_order_to_StridedShard(
+                        shard_order, placement_without_stridedshard, mesh
+                    )
+                )
+                b = distribute_tensor(x, mesh, placements_with_stridedshard)
+                shard_order_from_stridedshard = (
+                    DTensorSpec._maybe_convert_StridedShard_to_shard_order(
+                        placements_with_stridedshard, mesh
+                    )
+                )
+                self.assertEqual(shard_order, shard_order_from_stridedshard)
+                self.assertEqual(a.to_local(), b.to_local())
+
+    @with_comms
+    def test_StridedShard_not_convertible_to_shard_order(self):
+        with LocalTensorMode(ranks=self.world_size):
+            mesh = DeviceMesh("cpu", torch.arange(self.world_size).view(4, 8))
+            unconvertible_placements_list = [
+                [_StridedShard(0, split_factor=2), _StridedShard(1, split_factor=2)],
+                [_StridedShard(0, split_factor=2), Shard(1)],
+                [_StridedShard(1, split_factor=16), Shard(1)],
+            ]
+            for placements in unconvertible_placements_list:
+                shard_order = DTensorSpec._maybe_convert_StridedShard_to_shard_order(
+                    tuple(placements), mesh
+                )
+                self.assertIsNone(shard_order)
+
+
 class Test2DStridedLocalShard(DTensorTestBase):
    @property
    def world_size(self):
--- a/test/distributed/test_aten_comm_compute_reordering.py
+++ b/test/distributed/test_aten_comm_compute_reordering.py
@ -54,6 +54,7 @@ def apply_reordering_and_get_graph(graph, out_li) -> None:
        "max_compute_pre_fetch",
        "custom_runtime_estimation",
        "insert_overlap_deps",
+        "collective_estimator",
    )
    for key in config_keys:
        if (val := getattr(dist_opts, key)) is not None:
@ -943,6 +944,50 @@ class TestComputeCommReorderingBucketing(TestComputeCommReorderingMultiProc):
            correct = func(inputs_a, inputs_b, ranks=ranks)
            self.assertTrue(same(out, correct))

+    @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
+    def test_collective_benchmarking_with_real_pg(self):
+        """Test collective benchmarking with real process group (falls back on fake)."""
+
+        def func(a):
+            # Test all three collective types with 8x8 (power of 2 size = 256 elements = 1024 bytes for fp32)
+            ar = _functional_collectives.all_reduce(a, "sum", "0")
+            ag = _functional_collectives.all_gather_tensor(
+                a, 0, list(range(self.world_size))
+            )
+            rs = _functional_collectives.reduce_scatter_tensor(a, "sum", 0, "0")
+
+            b = torch.matmul(a, a)
+            c = torch.matmul(ar, b)
+            return c.sum() + ag.sum() + rs.sum()
+
+        patches = {
+            **get_patches(),
+            "aten_distributed_optimizations.collective_estimator": "benchmark",
+            "aten_distributed_optimizations.custom_runtime_estimation": None,  # Remove custom estimation so benchmarking happens
+        }
+
+        with _dynamo_dist_per_rank_init(
+            self.rank,
+            self.world_size,
+            self.backend(device_type),
+            fake_pg=not at_least_x_gpu(2),
+        ):
+            inputs = torch.ones(8, 8, dtype=torch.float, device=device_type) + self.rank
+
+            with torch._inductor.config.patch(patches):
+                compiled = torch.compile(func)
+                out, aten_graph_str = run_and_get_aten_graph(compiled, inputs)
+
+                # Verify all three collective types are present
+                FileCheck().check("all_reduce").check("all_gather").check(
+                    "reduce_scatter"
+                ).run(aten_graph_str)
+
+                # Test passes if compilation succeeded with benchmarking enabled
+                # Cache verification is tricky due to multiprocess test setup
+                correct = func(inputs)
+                self.assertTrue(same(out, correct))
+
    @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
    @torch._inductor.config.patch(get_bucket_patches())
    def test_multidtype_bucketing(self):
--- a/test/distributed/test_functional_api.py
+++ b/test/distributed/test_functional_api.py
@ -485,7 +485,7 @@ elif TEST_XPU:
 def exit_if_lt_x_accelerators(x):
    if torch.accelerator.is_available():
        if torch.accelerator.device_count() < x:
-            sys.exit(TEST_SKIPS[f"multi-accelerator-{x}"].exit_code)
+            sys.exit(TEST_SKIPS[f"multi-gpu-{x}"].exit_code)


 def with_comms(func=None):
--- a/test/dynamo/test_activation_checkpointing.py
+++ b/test/dynamo/test_activation_checkpointing.py
@ -1,4 +1,6 @@
 # Owner(s): ["module: dynamo"]
+# flake8: noqa: B950
+# flake8: noqa: E731
 import contextlib
 import copy
 import functools
@ -15,7 +17,11 @@ import torch.nn as nn
 import torch.utils.checkpoint
 from functorch.compile import min_cut_rematerialization_partition
 from torch._dynamo.backends.common import aot_autograd
-from torch._dynamo.testing import CompileCounterWithBackend
+from torch._dynamo.testing import (
+    AotEagerAndRecordGraphs,
+    CompileCounterWithBackend,
+    normalize_gm,
+)
 from torch._higher_order_ops.wrap import tag_activation_checkpoint
 from torch.testing._internal.common_device_type import instantiate_device_type_tests
 from torch.testing._internal.common_utils import IS_WINDOWS, skipIfHpu
@ -1649,6 +1655,43 @@ Non-primal fwd outputs from model w/o backward hook: {mod_no_hook_fwd_outputs_no

        self.assertEqual(opt_fn(x), fn(x))

+    def test_return_same_element_twice(self):
+        def gn(x):
+            y = torch.sin(x)
+            return y, y
+
+        def fn(x):
+            return torch.utils.checkpoint.checkpoint(gn, x, use_reentrant=True)
+
+        x = torch.randn(4, 4, requires_grad=True)
+        ref = fn(x)
+
+        backend = AotEagerAndRecordGraphs()
+        opt_fn = torch.compile(fn, backend=backend, fullgraph=True)
+        res = opt_fn(x)
+        self.assertEqual(ref[0], res[0])
+        self.assertEqual(ref[1], res[1])
+
+        self.assertExpectedInline(
+            normalize_gm(backend.graphs[0].print_readable(print_output=False)),
+            """\
+class GraphModule(torch.nn.Module):
+    def forward(self, L_x_: "f32[4, 4]"):
+        l_x_ = L_x_
+
+        wrap_body_0 = self.wrap_body_0
+        tag_activation_checkpoint = torch.ops.higher_order.tag_activation_checkpoint(wrap_body_0, l_x_, use_reentrant = True);  wrap_body_0 = l_x_ = None
+        getitem: "f32[4, 4]" = tag_activation_checkpoint[0]
+        getitem_1: "f32[4, 4]" = tag_activation_checkpoint[1];  tag_activation_checkpoint = None
+        return (getitem, getitem_1)
+
+    class wrap_body_0(torch.nn.Module):
+        def forward(self, l_x_: "f32[4, 4]"):
+            y: "f32[4, 4]" = torch.sin(l_x_);  l_x_ = None
+            return (y, y)
+""",
+        )
+
    @torch._dynamo.config.patch(skip_fwd_side_effects_in_bwd_under_checkpoint=True)
    def test_nonlocal_mutation(self):
        counter = 0
@ -1672,6 +1715,114 @@ Non-primal fwd outputs from model w/o backward hook: {mod_no_hook_fwd_outputs_no
        # The mutation is not reapplied in the backward because the flag was on.
        self.assertEqual(counter, 1)

+    @torch._dynamo.config.patch(skip_fwd_side_effects_in_bwd_under_checkpoint=True)
+    def test_nonlocal_list_mutation(self):
+        def gn(x, z):
+            out = x.sin()
+            z.append(out)
+            return torch.cos(torch.sin(torch.matmul(x, x) @ x)), out
+
+        def fn(x):
+            z = []
+
+            out1, out2 = torch.utils.checkpoint.checkpoint(
+                gn,
+                x,
+                z,
+                use_reentrant=False,
+            )
+
+            return out1, z[0]
+
+        x = torch.randn(4, 4, requires_grad=True)
+        ref = fn(x)
+
+        opt_fn = torch.compile(fn, backend="eager", fullgraph=True)
+        res = opt_fn(x)
+        self.assertEqual(ref[0], res[0])
+        self.assertEqual(ref[1], res[1])
+
+    @torch._dynamo.config.patch(skip_fwd_side_effects_in_bwd_under_checkpoint=True)
+    def test_nonlocal_list_mutation_hidden(self):
+        def gn(x, z):
+            o = torch.matmul(x, x) @ x
+            out = x.sin()
+            z.append(out)
+            return torch.cos(torch.sin(o)), torch.sin(x)
+
+        def fn(x):
+            z = []
+
+            outs = torch.utils.checkpoint.checkpoint(
+                gn,
+                x,
+                z,
+                use_reentrant=False,
+            )
+            out1 = outs[0]
+            # Check that the extra output pytree handling is done properly
+            out2 = outs[-1]
+
+            return out1 + out2, z[0]
+
+        x = torch.randn(4, 4, requires_grad=True)
+        ref = fn(x)
+
+        backend = AotEagerAndRecordGraphs()
+        opt_fn = torch.compile(fn, backend=backend, fullgraph=True)
+        res = opt_fn(x)
+        self.assertEqual(ref[0], res[0])
+        self.assertEqual(ref[1], res[1])
+
+        self.assertExpectedInline(
+            normalize_gm(backend.graphs[0].print_readable(print_output=False)),
+            """\
+class GraphModule(torch.nn.Module):
+    def forward(self, L_x_: "f32[4, 4]"):
+        l_x_ = L_x_
+
+        wrap_body_0 = self.wrap_body_0
+        tag_activation_checkpoint = torch.ops.higher_order.tag_activation_checkpoint(wrap_body_0, l_x_, use_reentrant = False);  wrap_body_0 = l_x_ = None
+        out1: "f32[4, 4]" = tag_activation_checkpoint[0]
+        out2: "f32[4, 4]" = tag_activation_checkpoint[1]
+        getitem_4: "f32[4, 4]" = tag_activation_checkpoint[4];  tag_activation_checkpoint = None
+
+        add: "f32[4, 4]" = out1 + out2;  out1 = out2 = None
+        return (add, getitem_4)
+
+    class wrap_body_0(torch.nn.Module):
+        def forward(self, l_x_: "f32[4, 4]"):
+            matmul: "f32[4, 4]" = torch.matmul(l_x_, l_x_)
+            o: "f32[4, 4]" = matmul @ l_x_
+
+            out: "f32[4, 4]" = l_x_.sin()
+
+            sin_1: "f32[4, 4]" = torch.sin(o)
+            child: "f32[4, 4]" = torch.cos(sin_1)
+            child_1: "f32[4, 4]" = torch.sin(l_x_);  l_x_ = None
+            return (child, child_1, matmul, o, out, sin_1)
+""",
+        )
+
+        self.assertExpectedInline(
+            normalize_gm(backend.fw_graphs[0].print_readable(print_output=False)),
+            """\
+class GraphModule(torch.nn.Module):
+    def forward(self, primals_1: "f32[4, 4]"):
+        mm: "f32[4, 4]" = torch.ops.aten.mm.default(primals_1, primals_1)
+        mm_1: "f32[4, 4]" = torch.ops.aten.mm.default(mm, primals_1);  mm = None
+
+        sin: "f32[4, 4]" = torch.ops.aten.sin.default(primals_1)
+
+        sin_1: "f32[4, 4]" = torch.ops.aten.sin.default(mm_1);  mm_1 = None
+        cos: "f32[4, 4]" = torch.ops.aten.cos.default(sin_1);  sin_1 = None
+        sin_2: "f32[4, 4]" = torch.ops.aten.sin.default(primals_1)
+
+        add: "f32[4, 4]" = torch.ops.aten.add.Tensor(cos, sin_2);  cos = sin_2 = None
+        return (add, sin, primals_1)
+""",
+        )
+

 devices = ["cuda", "hpu"]
 instantiate_device_type_tests(
--- a/test/dynamo/test_decorators.py
+++ b/test/dynamo/test_decorators.py
@ -2109,6 +2109,89 @@ Detected recompile when torch.compile stance is 'fail_on_recompile'. filename: '
        with self.assertRaises(Unsupported):
            outer_f2(inp)

+    def test_disable_recursive_flags(self):
+        class SimpleLinear(torch.nn.Module):
+            def __init__(self) -> None:
+                super().__init__()
+                self.layer0 = torch.nn.Linear(4, 4)
+
+            def forward(self, inp):
+                return self.layer0(torch.sigmoid(inp))
+
+        class SimpleModel(torch.nn.Module):
+            def __init__(self) -> None:
+                super().__init__()
+                self.layer0 = SimpleLinear()
+                self.layer1 = torch.nn.Linear(4, 4)
+
+            def forward(self, inp):
+                z = self.layer0(torch.sin(inp))
+                return self.layer1(z)
+
+        for recursive_flag in [True, False]:
+            model = SimpleModel()
+            other_model = SimpleModel()
+
+            model.forward = torch._dynamo.disable(
+                model.forward,
+                recursive=recursive_flag,
+            )
+            self.assertEqual(
+                torch._dynamo.is_dynamo_disable_recursive(model.forward),
+                recursive_flag,
+            )
+
+            other_model = torch._dynamo.disable(other_model, recursive=recursive_flag)
+            self.assertEqual(
+                torch._dynamo.is_dynamo_disable_recursive(
+                    other_model.forward
+                    if isinstance(other_model, torch.nn.Module)
+                    else other_model
+                ),
+                recursive_flag,
+            )
+
+            # check the model is compilable
+            torch.compile(model)
+            torch.compile(other_model)
+
+    def test_dynamo_disable_annotations(self):
+        class SimpleModel(torch.nn.Module):
+            def __init__(self) -> None:
+                super().__init__()
+                self.register_buffer("buffer", torch.rand(2, 2))
+
+            @torch._dynamo.disable()
+            def f1(self, x) -> torch.Tensor:
+                return x + self.buffer + 1
+
+            @torch._dynamo.disable()
+            def f2(self, x) -> torch.Tensor:
+                return x + self.buffer + 2
+
+            def forward(self, x) -> torch.Tensor:
+                return self.f1(x) + self.f2(x)
+
+        model = SimpleModel()
+        inp = torch.rand(2, 2)
+        with torch.fx.traceback.preserve_node_meta():
+            exported_model = torch.export.export(model, (inp,))
+        graph = exported_model.graph_module.graph
+        found_f1 = False
+        found_f2 = False
+        for node in graph.nodes:
+            if "custom" in node.meta:
+                if "_torchdynamo_disable_method" in node.meta["custom"]:
+                    if node.meta["custom"]["_torchdynamo_disable_method"] == "f1":
+                        found_f1 = True
+                    elif node.meta["custom"]["_torchdynamo_disable_method"] == "f2":
+                        found_f2 = True
+        self.assertTrue(found_f1)
+        self.assertTrue(found_f2)
+        model.forward = torch._dynamo.disable(model.forward, recursive=False)
+        with self.assertRaises(RuntimeError):
+            exported_model = torch.export.export(model, (inp,))
+

 if __name__ == "__main__":
    from torch._dynamo.test_case import run_tests
--- a/Show More
+++ b/Show More