Compare commits


123 Commits

Author SHA1 Message Date
39e77ce851 [dynamo] Add most recent bytecode to graph break with developer initiation
ghstack-source-id: 8b538f2e1ac703a4538468a758f08db0c89b91a7
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163720

Add the most recent bytecode to dynamo graph breaks called by the user

Fix other user-initiated graph breaks and issues

Fix linter
2025-10-01 17:21:03 -07:00
22c5e8c17c Add num_store to inductor_meta and use it to scale persistent reduction x block (#162446)
Scale up XBLOCK for contiguous persistent reductions based on rnumel and number of loads + stores

<img width="928" height="656" alt="Screenshot 2025-09-18 at 5 02 57 PM" src="https://github.com/user-attachments/assets/ec3c561f-2a3f-4459-9e14-653715898da3" />

Differential Revision: [](https://our.internmc.facebook.com/intern/diff/)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162446
Approved by: https://github.com/v0i0, https://github.com/eellison, https://github.com/shunting314
ghstack dependencies: #162296
2025-09-23 20:36:39 +00:00
bcb893acb0 [ROCm] Build FBGEMM_GENAI for gfx942 only (#162648)
Fixes build timeouts >4h on libtorch build jobs: 75e7f49f9c/1

Brings back code to narrow down CK compilation targets from 69a25f6888 (diff-ce80f3115ab2f6be5142f0678a1fc92c6b2d7727766ce44f48726c99e720f777)

gfx942 supports fp8

Don't enable gfx950 for now, until more optimizations are in place as per https://github.com/pytorch/pytorch/pull/162648/files#r2369588738

Validation:
[rocm6.4](https://github.com/pytorch/pytorch/actions/runs/17944766350/job/51028483128) and [rocm6.3](https://github.com/pytorch/pytorch/actions/runs/17944766350/job/51028483093) libtorch builds finished within 3.9h.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162648
Approved by: https://github.com/jeffdaily

Co-authored-by: Jeff Daily <jeff.daily@amd.com>
2025-09-23 18:55:35 +00:00
8e6b0c71fb [Inductor] Remove no_type_check annotation on properties (#163570)
Some properties with `cache_on_self` were previously annotated with `no_type_check`, to get around mypy limitations. This PR replaces both annotations with `cache_property_on_self`, to enable type checking.
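
A minimal, self-contained sketch of what such a combined decorator could look like; `cache_property_on_self` here is a stand-in, not the actual Inductor helper:

```python
import functools

def cache_property_on_self(fn):
    # Stand-in for the helper the PR refers to: behaves like @property combined
    # with @cache_on_self, caching the computed value on the instance.
    attr = "_cached_" + fn.__name__

    @property
    @functools.wraps(fn)
    def wrapper(self):
        if not hasattr(self, attr):
            setattr(self, attr, fn(self))
        return getattr(self, attr)

    return wrapper

class Kernel:
    @cache_property_on_self
    def num_loads(self) -> int:
        print("computed once")
        return 4

k = Kernel()
print(k.num_loads, k.num_loads)  # the body runs only on the first access
```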

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163570
Approved by: https://github.com/mlazos, https://github.com/PaulZhang12, https://github.com/Skylion007
2025-09-23 18:20:04 +00:00
0696a4b0b8 [EZ] Perma-ignore UP038 (#163649)
As it has been removed, see https://docs.astral.sh/ruff/rules/non-pep604-isinstance/
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163649
Approved by: https://github.com/Skylion007
ghstack dependencies: #163648
2025-09-23 17:58:18 +00:00
ca35dc2fdd [EZ] Fix UP041 violations (#163648)
I.e. use `TimeoutError` instead of `socket.timeout`
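
For illustration (since Python 3.10, `socket.timeout` is an alias of the builtin `TimeoutError`):

```python
import socket

try:
    # 192.0.2.1 is a TEST-NET address, so this connection attempt times out.
    socket.create_connection(("192.0.2.1", 80), timeout=0.01)
except TimeoutError:        # previously: except socket.timeout:
    print("connection timed out")
except OSError as e:        # any other socket-level failure
    print(f"other socket error: {e}")
```
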
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163648
Approved by: https://github.com/cyyever, https://github.com/Skylion007
2025-09-23 17:58:18 +00:00
649ceda8a5 [export] handling NamedTuple inputs (#162959)
Fixes #160547
### Summary:
bug
```
    def test_namedtuple(self):
        from collections import namedtuple
        Point = namedtuple('Point', 'x y')

        class M(torch.nn.Module):
            def forward(self, x, y):
                return x + y

        inp = Point(torch.ones(3), torch.ones(3))
        print(M()(*inp))

        # errors
        ep = torch.export.export(M(), inp, strict=False)
        print(ep)

        # succeeds
        ep = torch.export.export(M(), inp, strict=True)
        print(ep)

        # workaround could be to convert namedtuple to a kwarg
        inp_kwargs =  {field: getattr(inp, field) for field in inp._fields}
        ep = torch.export.export(M(), (), inp_kwargs)
        print(ep)
```
Fix:
A namedtuple is a subclass of tuple, but the export input handling did not expect a namedtuple, so this change handles the namedtuple case.
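
A hedged sketch of the kind of namedtuple check this implies (the actual export input handling may differ):

```python
from collections import namedtuple

def is_namedtuple_instance(obj) -> bool:
    # Common heuristic: a namedtuple is a tuple subclass that carries _fields.
    return isinstance(obj, tuple) and hasattr(type(obj), "_fields")

Point = namedtuple("Point", "x y")
print(is_namedtuple_instance(Point(1, 2)))  # True
print(is_namedtuple_instance((1, 2)))       # False
```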

I have added a 🧪 test case for this as well.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162959
Approved by: https://github.com/angelayi

Co-authored-by: Angela Yi <angelayi@meta.com>
2025-09-23 17:43:50 +00:00
2aadcea05c [ROCm] Improve perf for elementwise broadcast with mixed dtype (#163562)
* Unroll loops manually to hide memory access latency

Co-author: @amd-hhashemi

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163562
Approved by: https://github.com/jeffdaily
2025-09-23 17:42:48 +00:00
fde929c8a8 [AOTI] Fix model_package_loader get_cpp_compile_command (#163561)
This should fix the AOTI UTs in `test_aot_inductor_package.py`; these cases fail at `compile_so`.

reproducer:
```cmd
pytest test\inductor\test_aot_inductor_package.py -v -k test_multiple_methods
```
<img width="1262" height="95" alt="image" src="https://github.com/user-attachments/assets/49458536-1cfe-498e-a12a-2bfd8da67a9e" />

The major fix is in `get_cpp_compile_command`. The code is aligned with the cpp_builder frontend code:  3ef1bef36c/torch/_inductor/cpp_builder.py (L1780-L1790)
3ef1bef36c/torch/_inductor/cpp_builder.py (L1959-L1976)

Fixed on Windows:
<img width="1261" height="89" alt="Image" src="https://github.com/user-attachments/assets/9bf43b11-aac1-4161-a625-e602e313a299" />

Also validated on Linux:
<img width="1039" height="81" alt="Image" src="https://github.com/user-attachments/assets/46063e16-6cf1-4a28-8466-0496871b8619" />

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163561
Approved by: https://github.com/jansel
2025-09-23 17:38:18 +00:00
134dfbeaef [DCP] DTensor slice dequantization with proper block alignment (#163532)
Summary:
When loading quantized tensors with DTensor slicing, the dequantization process was producing numerically incorrect results due to improper block-to-slice coordinate mapping. The previous implementation calculated block boundaries relative to the sliced tensor dimensions instead of the original full tensor dimensions, causing scale factors to be applied to wrong tensor regions.

This fix addresses the issue by:

1. **Proper coordinate mapping**: Added `_get_slice_to_block_mapping()` to correctly map tensor slices to quantization blocks using global coordinates from the full tensor shape.

2. **Block-aligned dequantization**: Updated `_dequantize_tensor()` to use proper block intersection logic, ensuring scale factors are applied to the correct portions of sliced tensors.

The fix ensures that when DTensor requests a slice of a quantized tensor, the dequantization correctly identifies which quantization blocks intersect with the requested slice and applies the appropriate scale factors to the right tensor regions.
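
A hedged, 1-D sketch of the block-aligned mapping described above (names such as `block_size` and the per-block `scales` layout are assumptions, not the actual DCP code):

```python
import torch

def dequantize_slice(q_slice, scales, block_size, slice_start):
    out = q_slice.to(torch.float32).clone()
    for i in range(q_slice.numel()):
        global_idx = slice_start + i          # coordinate in the *full* tensor
        block_idx = global_idx // block_size  # quantization block it falls in
        out[i] = out[i] * scales[block_idx]
    return out

q = torch.tensor([1, 1, 1, 1], dtype=torch.int8)
# slice_start=4 with block_size=4 lands the slice in block 1, so scale 0.5 applies
print(dequantize_slice(q, scales=torch.tensor([2.0, 0.5]), block_size=4, slice_start=4))
```

The bug described above corresponds to computing `block_idx` from the local index `i` (the sliced tensor) instead of from `slice_start + i` (the full tensor).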

Test Plan:
Tested with DTensor configurations where quantized tensors are sliced across different dimensions. Verified that:
1. Dequantized tensor values are numerically correct
2. Block boundaries are properly calculated relative to full tensor shape
3. Scale factors are applied to correct tensor regions
4. Tensor shapes map is built efficiently using only metadata

Correctness validation using https://github.com/wwwjn/torchtitan/blob/dsv3-sd-test/tests/fsdp_dequantized_load.py
```
{
  "model.layers.0.mlp.gate_proj.weight": {
    "mse": 4.30626645453458e-11,
    "mae": 9.98388827611052e-07,
    "max_abs_diff": 0.0009703934192657471,
    "cosine_similarity": 1.010810375213623,
    "relative_error": 0.001330620958469808,
    "kl_divergence_1_to_2": "6.563401e-08",
    "kl_divergence_2_to_1": "-6.522914e-08",
    "js_divergence": 1.3711876079014476e-10,
    "shape": [
      18432,
      7168
    ],
    "t1_stats": {
      "min": -0.4453125,
      "max": 0.30859375,
      "mean": -1.2592146958922967e-05
    },
    "t2_stats": {
      "min": -0.44529813528060913,
      "max": 0.3085886240005493,
      "mean": -1.2624391274584923e-05
    }
  },
  "model.layers.0.mlp.up_proj.weight": {
    "mse": 2.5534721906361746e-11,
    "mae": 3.118609583907528e-06,
    "max_abs_diff": 0.00047551095485687256,
    "cosine_similarity": 1.038962483406067,
    "relative_error": 0.0013681650161743164,
    "kl_divergence_1_to_2": "-5.8253768e-08",
    "kl_divergence_2_to_1": "5.8747577e-08",
    "js_divergence": NaN,
    "shape": [
      18432,
      7168
    ],
    "t1_stats": {
      "min": -0.228515625,
      "max": 0.2333984375,
      "mean": 8.862222955485777e-08
    },
    "t2_stats": {
      "min": -0.2285017967224121,
      "max": 0.23338991403579712,
      "mean": 8.824501662729745e-08
    }
  },
  "model.layers.0.mlp.down_proj.weight": {
    "mse": 2.2803769289536646e-11,
    "mae": 2.8916260816913564e-06,
    "max_abs_diff": 0.0008973777294158936,
    "cosine_similarity": 1.0376262664794922,
    "relative_error": 0.001346255769021809,
    "kl_divergence_1_to_2": "1.2744896e-07",
    "kl_divergence_2_to_1": "-1.2736885e-07",
    "js_divergence": 5.992362162032805e-11,
    "shape": [
      7168,
      18432
    ],
    "t1_stats": {
      "min": -0.54296875,
      "max": 0.546875,
      "mean": -2.9487239316949854e-07
    },
    "t2_stats": {
      "min": -0.5429964661598206,
      "max": 0.5469087362289429,
      "mean": -2.9507478416235244e-07
    }
  }
}
```

https://www.internalfb.com/intern/testinfra/testrun/3940649985202645

Differential Revision: D82975005

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163532
Approved by: https://github.com/wwwjn
2025-09-23 16:48:16 +00:00
221ac81043 Revert "[precompile] Add option to disable guard check on aot-compiled function. (#163432)"
This reverts commit 539e84e289fa7563032410706ede50a4eaa7a15d.

Reverted https://github.com/pytorch/pytorch/pull/163432 on behalf of https://github.com/Camyll due to breaking internal tests ([comment](https://github.com/pytorch/pytorch/pull/163432#issuecomment-3324757069))
2025-09-23 16:31:30 +00:00
6e5dddba64 Use accelerator API in common_dtensor (#163498)
Fixes #ISSUE_NUMBER

Try to unify the device checking in common_dtensor (the testing module) via the accelerator API.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163498
Approved by: https://github.com/albanD, https://github.com/H-Huang
2025-09-23 16:30:20 +00:00
ebddbe787a [ROCm][CI] skip test_sparse_triangular_solve (#163651)
Need more time to debug, but we also need a clean CI signal. The test was unskipped by #163495, but had been skipped on ROCm prior.

Fixes #ISSUE_NUMBER

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163651
Approved by: https://github.com/jeffdaily

Co-authored-by: Jeff Daily <jeff.daily@amd.com>
2025-09-23 15:55:51 +00:00
5f0c7cb4aa Add B200 smoke test (#159494)
Okay, running test_max_autotune locally on B200 is a horrible read; for now, to get something landed, I am focusing on test_matmul_cuda.py and test_fp8.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159494
Approved by: https://github.com/nWEIdia, https://github.com/huydhn
ghstack dependencies: #163460, #163537, #163552
2025-09-23 15:45:05 +00:00
b3cf5c79dd Skip on sm100 later since tests are non-deterministic (#163552)
This is tracked https://github.com/pytorch/pytorch/issues/163462

Skipping since we are seeing sporadic errors locally and on CI.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163552
Approved by: https://github.com/eqy, https://github.com/Skylion007
ghstack dependencies: #163460, #163537
2025-09-23 15:45:05 +00:00
0f674077f4 Large tests failing on bfloat16 (#163537)
# Summary

I ran these tests locally; each 10k test takes over 5 minutes to run on an extremely beefy CPU. I think that this is overkill, but feel free to disagree. Also, the one test I ran that failed earlier up in the stack failed with a 1 ULP difference, so I think this is kind of an edge case in how we do testing (I will write up an issue with my thoughts later).

``` Shell
==================================================================================================== FAILURES =====================================================================================================
_________________________________________________________ TestMatmulCudaCUDA.test_cublas_addmm_reduced_precision_size_10000_backend_cublas_cuda_bfloat16 __________________________________________________________
Traceback (most recent call last):
  File "/home/dev/.conda/envs/nightly/lib/python3.12/unittest/case.py", line 58, in testPartExecutor
    yield
  File "/home/dev/.conda/envs/nightly/lib/python3.12/unittest/case.py", line 634, in run
    self._callTestMethod(testMethod)
  File "/home/dev/.conda/envs/nightly/lib/python3.12/unittest/case.py", line 589, in _callTestMethod
    if method() is not None:
       ^^^^^^^^
  File "/home/dev/.conda/envs/nightly/lib/python3.12/site-packages/torch/testing/_internal/common_utils.py", line 3223, in wrapper
    method(*args, **kwargs)
  File "/home/dev/.conda/envs/nightly/lib/python3.12/site-packages/torch/testing/_internal/common_utils.py", line 3223, in wrapper
    method(*args, **kwargs)
  File "/home/dev/.conda/envs/nightly/lib/python3.12/site-packages/torch/testing/_internal/common_device_type.py", line 426, in instantiated_test
    result = test(self, **param_kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/dev/.conda/envs/nightly/lib/python3.12/site-packages/torch/testing/_internal/common_device_type.py", line 1408, in only_fn
    return fn(slf, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/dev/.conda/envs/nightly/lib/python3.12/site-packages/torch/testing/_internal/common_utils.py", line 2024, in wrap_fn
    return fn(self, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/dev/meta/pytorch/test/test_matmul_cuda.py", line 190, in test_cublas_addmm_reduced_precision
    self.cublas_addmm(size, dtype, True)
  File "/home/dev/meta/pytorch/test/test_matmul_cuda.py", line 162, in cublas_addmm
    assert_close_with_ulp(res_cpu, res_cuda, atol=tolerance.atol, rtol=tolerance.rtol)
  File "/home/dev/meta/transformer_nuggets/transformer_nuggets/numerics/__init__.py", line 222, in assert_close_with_ulp
    raise AssertionError("\n".join(error_parts))
AssertionError: Tensor-likes are not close!

Mismatched elements: 425 / 100030002 (0.0%)
Greatest absolute difference: 16 at index (2176, 9325) (up to 10 allowed)
Greatest relative difference: 3984 at index (376, 3754) (up to 0.2 allowed)

============================================================
ULP Analysis of Failures:
============================================================

Total failures: 425
ULP distances: min=-32761, max=32763, mean=-11513.7

Top 10 failures by absolute difference:
  #  | Index                      | Abs Diff    | Rel Diff    | ULP  | Expected     | Actual
----------------------------------------------------------------------------------------------------
   1 | (6923, 1580)               | 1.600000e+01 | 5.390625e-01 |  146 |    29.750000 |    13.750000
   2 | (4677, 420)                | 1.600000e+01 | 6.601562e-01 |   95 |    24.250000 |    40.250000
   3 | (2176, 9325)               | 1.600000e+01 | 6.875000e-01 |  210 |    23.250000 |     7.250000
   4 | (5119, 7865)               | 1.600000e+01 | 1.164062e+00 |  146 |   -13.750000 |   -29.750000
   5 | (3218, 8334)               | 1.600000e+01 | 2.593750e+00 |  236 |     6.156250 |    22.125000
   6 | (5245, 241)                | 1.600000e+01 | 5.468750e-01 |   75 |    29.250000 |    45.250000
   7 | (7666, 6549)               | 1.600000e+01 | 1.640000e+03 | 1376 |    -0.009766 |   -16.000000
   8 | (1663, 1115)               | 1.593750e+01 | 8.375000e+00 | -32427 |     1.898438 |   -14.062500
   9 | (3967, 7708)               | 1.593750e+01 | 1.368750e+01 | -32510 |     1.164062 |   -14.750000
  10 | (2874, 2038)               | 1.593750e+01 | 1.710938e+00 |  181 |     9.312500 |    25.250000

Note: Maximum absolute and relative errors occur at different locations
  Max abs diff location (2176, 9325): 210 ULP
  Max rel diff location (376, 3754): 31868 ULP

To execute this test, run the following from the base repo dir:
    python test/test_matmul_cuda.py TestMatmulCudaCUDA.test_cublas_addmm_reduced_precision_size_10000_backend_cublas_cuda_bfloat16

This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0
________________________________________________________ TestMatmulCudaCUDA.test_cublas_addmm_reduced_precision_size_10000_backend_cublaslt_cuda_bfloat16 _________________________________________________________
Traceback (most recent call last):
  File "/home/dev/.conda/envs/nightly/lib/python3.12/unittest/case.py", line 58, in testPartExecutor
    yield
  File "/home/dev/.conda/envs/nightly/lib/python3.12/unittest/case.py", line 634, in run
    self._callTestMethod(testMethod)
  File "/home/dev/.conda/envs/nightly/lib/python3.12/unittest/case.py", line 589, in _callTestMethod
    if method() is not None:
       ^^^^^^^^
  File "/home/dev/.conda/envs/nightly/lib/python3.12/site-packages/torch/testing/_internal/common_utils.py", line 3223, in wrapper
    method(*args, **kwargs)
  File "/home/dev/.conda/envs/nightly/lib/python3.12/site-packages/torch/testing/_internal/common_utils.py", line 3223, in wrapper
    method(*args, **kwargs)
  File "/home/dev/.conda/envs/nightly/lib/python3.12/site-packages/torch/testing/_internal/common_device_type.py", line 426, in instantiated_test
    result = test(self, **param_kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/dev/.conda/envs/nightly/lib/python3.12/site-packages/torch/testing/_internal/common_device_type.py", line 1408, in only_fn
    return fn(slf, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/dev/.conda/envs/nightly/lib/python3.12/site-packages/torch/testing/_internal/common_utils.py", line 2024, in wrap_fn
    return fn(self, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/dev/meta/pytorch/test/test_matmul_cuda.py", line 190, in test_cublas_addmm_reduced_precision
    self.cublas_addmm(size, dtype, True)
  File "/home/dev/meta/pytorch/test/test_matmul_cuda.py", line 162, in cublas_addmm
    assert_close_with_ulp(res_cpu, res_cuda, atol=tolerance.atol, rtol=tolerance.rtol)
  File "/home/dev/meta/transformer_nuggets/transformer_nuggets/numerics/__init__.py", line 222, in assert_close_with_ulp
    raise AssertionError("\n".join(error_parts))
AssertionError: Tensor-likes are not close!

Mismatched elements: 425 / 100030002 (0.0%)
Greatest absolute difference: 16 at index (2176, 9325) (up to 10 allowed)
Greatest relative difference: 3984 at index (376, 3754) (up to 0.2 allowed)

============================================================
ULP Analysis of Failures:
============================================================

Total failures: 425
ULP distances: min=-32761, max=32763, mean=-11513.7

Top 10 failures by absolute difference:
  #  | Index                      | Abs Diff    | Rel Diff    | ULP  | Expected     | Actual
----------------------------------------------------------------------------------------------------
   1 | (6923, 1580)               | 1.600000e+01 | 5.390625e-01 |  146 |    29.750000 |    13.750000
   2 | (4677, 420)                | 1.600000e+01 | 6.601562e-01 |   95 |    24.250000 |    40.250000
   3 | (2176, 9325)               | 1.600000e+01 | 6.875000e-01 |  210 |    23.250000 |     7.250000
   4 | (5119, 7865)               | 1.600000e+01 | 1.164062e+00 |  146 |   -13.750000 |   -29.750000
   5 | (3218, 8334)               | 1.600000e+01 | 2.593750e+00 |  236 |     6.156250 |    22.125000
   6 | (5245, 241)                | 1.600000e+01 | 5.468750e-01 |   75 |    29.250000 |    45.250000
   7 | (7666, 6549)               | 1.600000e+01 | 1.640000e+03 | 1376 |    -0.009766 |   -16.000000
   8 | (1663, 1115)               | 1.593750e+01 | 8.375000e+00 | -32427 |     1.898438 |   -14.062500
   9 | (3967, 7708)               | 1.593750e+01 | 1.368750e+01 | -32510 |     1.164062 |   -14.750000
  10 | (2874, 2038)               | 1.593750e+01 | 1.710938e+00 |  181 |     9.312500 |    25.250000

Note: Maximum absolute and relative errors occur at different locations
  Max abs diff location (2176, 9325): 210 ULP
  Max rel diff location (376, 3754): 31868 ULP

To execute this test, run the following from the base repo dir:
    python test/test_matmul_cuda.py TestMatmulCudaCUDA.test_cublas_addmm_reduced_precision_size_10000_backend_cublaslt_cuda_bfloat16

This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0
```
Okay, the bfloat16 failures are for sure real. cc @eqy
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163537
Approved by: https://github.com/Skylion007, https://github.com/malfet, https://github.com/eqy
ghstack dependencies: #163460
2025-09-23 15:45:05 +00:00
720a7b2887 [export] Remove .contiguous() when saving weights to raw bytes (#163587)
Summary: `.contiguous()` will discard the original storage size of the tensor, and could lead to issues during loading.
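
A small illustration (separate from the export code) of how `.contiguous()` on a non-contiguous view drops the original storage size:

```python
import torch

base = torch.arange(10, dtype=torch.float32)
view = base[::2]                      # non-contiguous view into base's 10-element storage
print(view.untyped_storage().nbytes())               # 40: still the full original storage
print(view.contiguous().untyped_storage().nbytes())  # 20: the original storage size is gone
```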

Test Plan:
buck2 run mode/dev-nosan caffe2/test:test_export -- -r test_1D_tensor_slicing
buck2 run mode/dev-nosan caffe2/test:test_export -- -r test_2D_tensor_slicing

Differential Revision: D83016250

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163587
Approved by: https://github.com/angelayi
2025-09-23 15:44:56 +00:00
49e7b2f69d [inductor] Fix error from custom CUDA allocators (#163422)
Fixes #163257

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163422
Approved by: https://github.com/eellison
ghstack dependencies: #163386, #163398, #163387, #163414, #163415, #163419, #163434, #163393, #163412
2025-09-23 15:37:45 +00:00
6ef74879f6 [dynamo] Fix TorchFunctionMode handling with get_rng_state (#163412)
Fixes #162624
Fixes #162586

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163412
Approved by: https://github.com/eellison
ghstack dependencies: #163386, #163398, #163387, #163414, #163415, #163419, #163434, #163393
2025-09-23 15:37:45 +00:00
9c4d9f940b [inductor] Support out_dtype arg to matmul (#163393)
Fixes #163275

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163393
Approved by: https://github.com/eellison, https://github.com/coconutruben
ghstack dependencies: #163386, #163398, #163387, #163414, #163415, #163419, #163434
2025-09-23 15:37:38 +00:00
ed84e808f0 [inductor] Freeze layouts in FlexAttention (#163434)
Fixes #163300

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163434
Approved by: https://github.com/drisspg
ghstack dependencies: #163386, #163398, #163387, #163414, #163415, #163419
2025-09-23 15:37:29 +00:00
518c320676 [inductor] libdevice.sqrt => tl.sqrt_rn (#163419)
Fixes #163082

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163419
Approved by: https://github.com/Skylion007, https://github.com/mlazos
ghstack dependencies: #163386, #163398, #163387, #163414, #163415
2025-09-23 15:37:21 +00:00
4264fd34ec Add basic tests for torch.distributed.tensor._utils.compute_global_tensor_info (#162968)
Next PR writes a C++ implementation. Seems good to have tests first.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162968
Approved by: https://github.com/ezyang
ghstack dependencies: #161695, #162508
2025-09-23 14:56:32 +00:00
e05c9c0c84 [ROCm][CI] cudagraph trees ut fixes (#163592)
Fixes #162125.
Fixes #160719.
Fixes #157901.
Fixes #157871.
Fixes #157761.
Fixes #157723.
Fixes #157643.
Fixes #157616.
Fixes #157556.
Fixes #157533.
Fixes #157449.
Fixes #157428.
Fixes #157413.
Fixes #157367.
Fixes #157350.
Fixes #157339.
Fixes #157312.
Fixes #157280.
Fixes #157258.
Fixes #157173.
Fixes #157143.
Fixes #157112.
Fixes #157086.
Fixes #157058.
Fixes #157035.
Fixes #156984.
Fixes #156957.
Fixes #156954.
Fixes #156922.
Fixes #156886.
Fixes #156838.
Fixes #156808.
Fixes #156801.
Fixes #156778.
Fixes #156755.
Fixes #156735.
Fixes #156693.
Fixes #152561.
Fixes #130749.
Fixes #100074.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163592
Approved by: https://github.com/jeffdaily

Co-authored-by: Jeff Daily <jeff.daily@amd.com>
2025-09-23 14:45:00 +00:00
aff76c046d Revert "Add fake_impl for _native_multi_head_attention (#163167)"
This reverts commit 27164b6788cab6e6d8095012839e51c958a819d6.

Reverted https://github.com/pytorch/pytorch/pull/163167 on behalf of https://github.com/malfet due to This broke in inductor-cpu-test, see 1a42656d6c/1 ([comment](https://github.com/pytorch/pytorch/pull/163167#issuecomment-3324302026))
2025-09-23 14:36:45 +00:00
1a42656d6c [Flex attention] Fix flex attention head broadcast (#163426)
Fixes part of #163314

In particular bug: **Bug 1: H=None Broadcasting Produces Incorrect Results**

This fixes a shape bug when slicing BlockMask on the Q-tile axis with an int (**mask[:, :, i]**). That form of indexing collapses the Q dimension, so kv_num_blocks/kv_indices lose their expected [B, H, Q_tiles, …] shape. Because they lose their shape, even though the mask_mod remains "interpretable", the kernel's stride math reads wrong offsets, and we get silent numerical mismatches compared to regular SDPA, especially with single-position decoding/H broadcasting.

The B=None, H=None case works accidentally: with singleton batch/head the kernel maps to index 0 via `sparse_idx_z = off_zq % 1` and `sparse_idx_hq = off_hq % 1`, and with a single Q tile `q_start // SPARSE_Q_MULTIPLE = 0`. The missing Q-tiles stride is multiplied by 0, so the bad offset from the collapsed Q axis doesn't move the pointer, and it happens to read the first tile correctly. Once H > 1 or there are multiple Q tiles, those terms become nonzero and the kernel indexes with wrong strides, which causes silent errors.
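
For illustration, a small plain-tensor example of the indexing pitfall described above (not the actual BlockMask code): integer indexing collapses the Q-tile dimension, while a length-1 slice preserves it.

```python
import torch

kv_num_blocks = torch.zeros(2, 4, 8, dtype=torch.int32)  # [B, H, Q_tiles]
print(kv_num_blocks[:, :, 3].shape)    # torch.Size([2, 4])    -> Q dimension lost
print(kv_num_blocks[:, :, 3:4].shape)  # torch.Size([2, 4, 1]) -> layout preserved
```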

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163426
Approved by: https://github.com/drisspg
2025-09-23 13:01:51 +00:00
bda9ab291d [inductor] fix as_strided lowering with .view(dtype) inputs (#163319)
FIXES https://github.com/pytorch/pytorch/issues/163286

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163319
Approved by: https://github.com/eellison
2025-09-23 12:50:57 +00:00
3c64b2abab CUDA 13.0 Warning update for supported architectures (#163585)
Please see build script: 8da008678f/.ci/manywheel/build_cuda.sh (L69-L71)

This should display the correct warning:
```
Please install PyTorch with a following CUDA
configurations: 12.6 12.8 13.0 following instructions at
https://pytorch.org/get-started/locally/
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163585
Approved by: https://github.com/malfet
2025-09-23 11:27:11 +00:00
5d749ceb92 Remove test conditions for CUDA<12 (#163495)
Because CUDA >= 12 is now required.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163495
Approved by: https://github.com/janeyx99
2025-09-23 07:52:00 +00:00
8d81564df5 [pt2][cache] rework cache for true generic usage + better tests (#163488)
Differential Revision: D82933509

Over the weekend I realized that some of the cache implementation was a bit silly and too constrained to actually be generic. For example, InMemoryCache[str, bytes] was odd, since we'd probably want to be able to store more than just str keys with bytes values. So, tl;dr: everything is now generic, with the one constraint that Key and Value must both be pickle-able types. This makes things a lot simpler for us, since all caches can now be str -> bytes caches under the hood if we'd like, and Key/Value just get pickled on the way in and out.
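
A hedged sketch of the "pickle everything into a bytes-keyed store" idea described above (illustrative names, not the actual PT2 cache API):

```python
import pickle
from typing import Generic, Optional, TypeVar

K = TypeVar("K")
V = TypeVar("V")

class InMemoryCache(Generic[K, V]):
    def __init__(self) -> None:
        self._store: dict[bytes, bytes] = {}

    def put(self, key: K, value: V) -> None:
        # Any pickle-able key/value pair is reduced to bytes on the way in.
        self._store[pickle.dumps(key)] = pickle.dumps(value)

    def get(self, key: K) -> Optional[V]:
        blob = self._store.get(pickle.dumps(key))
        return None if blob is None else pickle.loads(blob)

cache: InMemoryCache[tuple[str, int], dict] = InMemoryCache()
cache.put(("kernel", 3), {"autotune": True})
print(cache.get(("kernel", 3)))
```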

With this change, there were also some improvements to the testing: mainly better coverage, but we now also test each cache across every combination of Key/Value types to ensure that they will work with the types we might specify later.

I also hardened some things here and there; for example, we now use literal_eval (I forgot who mentioned this on the first PR, but thank you for the suggestion!), and all errors coming from the caching will be wrapped in CacheError from now on (although we still raise from the original error context where possible).

Putting this PR up now for feedback. In the process of generalizing the code I did remove the documentation, since it was becoming outdated, but I will add that back in after the PR is green.

I have the next PR ready as well (it implements a fresh-cache context manager); I will export it once this lands.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163488
Approved by: https://github.com/aorenste, https://github.com/masnesral
2025-09-23 07:31:48 +00:00
b426ba1d5e [torchfuzz] introduce tensor and scalar pointwise ops (#163558)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163558
Approved by: https://github.com/laithsakka
ghstack dependencies: #163547, #163553, #163554, #163555, #163556, #163557
2025-09-23 06:20:13 +00:00
375f3e3a61 [OpenReg][Docs] Correct docs about openreg usage example. (#163235)
## Why this PR?
I've tried to follow the guidance of the `OpenReg` [usage example](https://github.com/pytorch/pytorch/tree/main/test/cpp_extensions/open_registration_extension/torch_openreg/third_party/openreg) and found that the command for compiling `example.cpp` (`g++ -o out example/example.cpp -L ./build -lopenreg`) is not compatible with my `gcc` (v11.4).

Since I installed my `gcc` through `apt install build-essential`, which I think is a common way for developers to install `gcc`, I believe it's necessary to slightly modify the command to add `-I ./` to explicitly indicate the header file search path.

## What I've changed?
- I added `-I ./` to correctly search for `./include/openreg.h`.
- I also added a `pwd` comment for better readability and removed unused imports in `example/example.cpp`.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163235
Approved by: https://github.com/FFFrog, https://github.com/albanD

Co-authored-by: Jiawei Li <ljw1101.vip@gmail.com>
2025-09-23 06:16:45 +00:00
45d9dcccc5 Update Kineto Submodule (#162222)
Summary: Update

Test Plan:
CI

Rollback Plan:

Differential Revision: D81727392

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162222
Approved by: https://github.com/sanrise
2025-09-23 06:08:55 +00:00
309fe03f4b [torchfuzz] remove unneeded try catch (#163557)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163557
Approved by: https://github.com/laithsakka
ghstack dependencies: #163547, #163553, #163554, #163555, #163556
2025-09-23 06:05:08 +00:00
1545bb1c00 [torchfuzz] shuffle compatible ops (#163556)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163556
Approved by: https://github.com/laithsakka
ghstack dependencies: #163547, #163553, #163554, #163555
2025-09-23 05:53:44 +00:00
d5e51d34f7 [torchfuzz] decompose -> fuzz_inputs_specs (#163555)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163555
Approved by: https://github.com/laithsakka
ghstack dependencies: #163547, #163553, #163554
2025-09-23 05:44:59 +00:00
08c5efde5f [torchfuzz] cache operators (#163554)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163554
Approved by: https://github.com/laithsakka
ghstack dependencies: #163547, #163553
2025-09-23 05:28:07 +00:00
19b754dff8 Revert "Update cutlass version for fbcode (#163091)"
This reverts commit 509c4e86270cc4decca58905d0f446e1fc0cf618.

Reverted https://github.com/pytorch/pytorch/pull/163091 on behalf of https://github.com/facebook-github-bot due to Diff reverted internally ([comment](https://github.com/pytorch/pytorch/pull/163091#issuecomment-3322428791))
2025-09-23 05:08:42 +00:00
d3a1345ed8 Use functools.cache on has_efa (#163439)
Cache the result of `has_efa` with `functools.cache`.
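
For illustration, a minimal example of the pattern (the body here is a hypothetical stand-in for the real EFA probe):

```python
import functools

@functools.cache  # memoize: the body runs once, later calls return the cached result
def has_efa() -> bool:
    # Hypothetical stand-in for the real check, which probes for AWS EFA support.
    print("probing for EFA...")
    return False

has_efa()  # prints "probing for EFA..." and caches the result
has_efa()  # returns the cached result without re-running the body
```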

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163439
Approved by: https://github.com/janeyx99
2025-09-23 05:03:03 +00:00
e3b392bdfd [BC breaking] Remove deprecated imports for torch.utils.data.datapipes.iter.grouping (#163438)
This PR removes the import tricks for `SHARDING_PRIORITIES` and `ShardingFilterIterDataPipe` from `torch.utils.data.datapipes.iter.grouping`. They were declared to be removed in PyTorch 2.1 but were not.
Before the change:
```
from torch.utils.data.datapipes.iter.grouping import SHARDING_PRIORITIES
from torch.utils.data.datapipes.iter.grouping import ShardingFilterIterDataPipe
```
works.
After the change:
an `ImportError` is raised.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163438
Approved by: https://github.com/janeyx99
2025-09-23 05:02:06 +00:00
bb5be56619 [torch][cuda][device_limits] Library for querying device hardware limits for flops and bandwidth (#162942)
In various benchmarks scattered across the repo, the limits for flops/second and memory bandwidth are usually hardcoded for a single device. This utility could help in providing a more structured way to query the device capabilities. If this is approved, we can use it when reporting flops efficiency and bandwidth relative to peak in the benchmarks and tests. The intent is to add more devices, more parameters (e.g. L2 cache bandwidth, NVLink, etc.) for both CPUs and accelerators.

Testing:

```
import torch

if torch.cuda.is_available():
    device = torch.cuda.current_device()
    mod = torch.get_device_module('cuda')
    hw = mod._device_limits.GPULimits(device)

    print(hw.get_tflops_per_second(torch.float16))
    print(hw.get_tflops_per_second(torch.float32))
    print(hw.get_tflops_per_second(torch.float64))
    print(hw.get_tflops_per_second(torch.bfloat16))
    print(hw.get_tflops_per_second(torch.int8))
    print(hw.get_memory_bandwidth_Bps() / 1e9)
    print(hw.get_shared_memory_bandwidth_Bps() / 1e9)

# Output on an H100 GPU
1070.53056
535.26528
66.90816
1070.53056
2141.06112
4893.696
33454.08
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162942
Approved by: https://github.com/ngimel, https://github.com/albanD
2025-09-23 04:48:19 +00:00
0e122380c2 [torchfuzz] remove supports_variable_inputs for now (#163553)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163553
Approved by: https://github.com/laithsakka
ghstack dependencies: #163547
2025-09-23 04:44:54 +00:00
fcd79d5228 [vllm hash update] update the pinned vllm hash (#163590)
This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml).
Update the pinned vllm hash.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163590
Approved by: https://github.com/pytorchbot
2025-09-23 04:44:15 +00:00
95ac7d724e Rename to _debug_mode.py to make it private (#163534)
rename debug_mode.py to _debug_mode.py to make it private, per @alban's request.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163534
Approved by: https://github.com/albanD
2025-09-23 04:27:10 +00:00
0b75a16200 [torchfuzz] Encapsulate fuzzing and codegen logic into ops (#163547)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163547
Approved by: https://github.com/laithsakka
2025-09-23 04:26:00 +00:00
27164b6788 Add fake_impl for _native_multi_head_attention (#163167)
Test Plan:
See added test in test_export.py

Rollback Plan:

Reviewed By: henryoier

Differential Revision: D77747446

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163167
Approved by: https://github.com/angelayi
2025-09-23 04:02:20 +00:00
cyy
447b8fc56d [2/N] Use filesystem in inductor (#163465)
Use std::filesystem in most inductor code. This is a follow-up of https://github.com/pytorch/pytorch/pull/152288.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163465
Approved by: https://github.com/Skylion007
2025-09-23 03:56:16 +00:00
6a48f57d2f [1/N] Remove 'type: ignore' suppressions (#163468)
Remove some unnecessary 'type: ignore' suppressions from python code.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163468
Approved by: https://github.com/Skylion007, https://github.com/janeyx99
2025-09-23 03:53:11 +00:00
e9300b2b7c remove allow-untyped-defs from ./torch/onnx/_internal/torchscript_exporter/_globals.py (#163472)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163472
Approved by: https://github.com/Skylion007
ghstack dependencies: #163246, #163469, #163470
2025-09-23 03:50:29 +00:00
8f30a8dc47 [AOTInductor] Add grid information for Triton Kernels (#160131)
Summary:
Add grid information for Triton Kernels for profiling in Kineto.

Test Plan:
Before change:
<img width="539" height="625" alt="Screenshot 2025-08-07 at 1 09 07 PM" src="https://github.com/user-attachments/assets/dd0778a9-2ff3-4819-acd3-de585cf7f9d1" />

After change:
<img width="550" height="898" alt="Screenshot 2025-08-07 at 1 05 49 PM" src="https://github.com/user-attachments/assets/d84988df-bb83-41ed-80ac-8a6d843a1a9d" />

*Note: we can extract grid size etc. from the device-side trace, but we're focusing on the host side specifically for this PR, mainly to add more host-side information needed for performance profiling in the future.

Reviewers:

Subscribers:

Tasks:

Tags:

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160131
Approved by: https://github.com/desertfire
2025-09-23 02:15:24 +00:00
2c7959eee9 [ignore][codex-test] Add typing to simple library registry (#161367)
## Summary
- add type annotations for simple library registry and dispatch rule holder
- remove allow-untyped-defs directive

## Testing
- `python -m mypy torch/_library/simple_registry.py` *(fails: repo expects mypy==1.16.0)*
- `lintrunner -a torch/_library/simple_registry.py` *(fails: attr-defined error in torchgen/gen_schema_utils.py)*
- `python test/test_torch.py TestTorch.test_dir` *(fails: ModuleNotFoundError: No module named 'torch')*

------
https://chatgpt.com/codex/tasks/task_e_68aa3cc210488326befdd992c79115a0
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161367
Approved by: https://github.com/Skylion007
2025-09-23 02:08:55 +00:00
3ef1bef36c [sdpa] make sure to recompile if alignment is different than before (#163083)
## Context
An example from Qwen2-7B
- This comes from running torch.compile with a sequence length that is
divisible by 8 (no padding needed). Call this `Run1`.
- If we then run the compiled model with a different length that isn't
divisible by 8 (requires padding), call this `Run2`.
- Then we'll see this error.
```
File "/var/tmp/torchinductor_nobody/2w/c2wby7ilxbna45xrtrrfjqpeutwouruviu2742ockunnd2bleeiz.py", line 1963, in call
    buf24 = torch.ops.aten._scaled_dot_product_efficient_attention_backward.default(reinterpret_tensor(buf18, (s85, 3584 // s19, s48, 512 // (512 // s19)), (s48*(512 // (512 // s19))*(3584 // s19), 512 // (512 // s19), (512 // (512 // s19))*(3584 // s19), 1), 0), buf20, buf21, buf22, buf23, getitem, getitem_1, getitem_2, getitem_3, 0.0, [True, True, True, False], scale=0.08838834764831845)
File "torch/_ops.py", line 841, in __call__
    return self._op(*args, **kwargs)
RuntimeError: attn_bias is not correctly aligned (strideM). attn_bias.stride(2) = 6102, and should be a multiple of 4.
```
- We only see the error because we did not recompile on `Run2`. Instead we ran the inputs on the same graph as `Run1`.

### A bit more on why.
Here we check whether to realize the unpadded buffer (unwrapped slice) which we want for `Run1` but not for `Run2`.
0897affcd5/torch/_inductor/lowering.py (L2687-L2694)

## Fix
Size hint doesn't guard, so the fix is to use `guard_or*` to guard.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163083
Approved by: https://github.com/eellison
2025-09-23 01:33:33 +00:00
539e84e289 [precompile] Add option to disable guard check on aot-compiled function. (#163432)
Summary:
Under some circumstances it seems reasonable to return a callable directly, without a guard check, when the user uses aot_compile on a function with a single compilation result.

When there are multiple entries (aot_compile_module), we should start enabling the guard check to differentiate the compiled functions.

Test Plan: CI

Differential Revision: D82904540

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163432
Approved by: https://github.com/dolpm
2025-09-23 01:00:05 +00:00
68e75be86a Update pytorch_sphinx_theme2 to latest hash (#163269)
The updated theme:
- Fixes articleBody in the json+ld that caused previous Google Search issues
- Other minor fixes
- 404.html fixes
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163269
Approved by: https://github.com/albanD
2025-09-22 23:20:23 +00:00
8da008678f Remove outdated commented CMake code (#163442)
Policies `CMP0023` and `CMP0022` have been removed in CMake 4.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163442
Approved by: https://github.com/janeyx99
2025-09-22 23:07:36 +00:00
fa15fb01ab [EZ] Remove XLA from unstable.yml (#163564)
It runs for 30 min on linux.12xlarge and then fails, and it has been like that since Aug 7th.

Besides, there are no more python-3.9 builds left.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163564
Approved by: https://github.com/seemethere, https://github.com/atalman, https://github.com/huydhn
2025-09-22 22:11:50 +00:00
clr
33daaad7d0 dynamo: Handle objects in graph that do not support weakref (#163168)
We are seeing crashes of the form
```
Traceback (most recent call last):
  File "/packages/aps_ads_vm/launcher_multiapp-inplace#link-tree/torch/_dynamo/symbolic_convert.py", line 1487, in run
    while self.step():
  File "/packages/aps_ads_vm/launcher_multiapp-inplace#link-tree/torch/_dynamo/symbolic_convert.py", line 1348, in step
    self.dispatch_table[inst.opcode](self, inst)
  File "/packages/aps_ads_vm/launcher_multiapp-inplace#link-tree/torch/_dynamo/symbolic_convert.py", line 2437, in LOAD_ATTR
    self._load_attr(inst)
  File "/packages/aps_ads_vm/launcher_multiapp-inplace#link-tree/torch/_dynamo/symbolic_convert.py", line 2425, in _load_attr
    result = BuiltinVariable(getattr).call_function(
  File "/packages/aps_ads_vm/launcher_multiapp-inplace#link-tree/torch/_dynamo/variables/builtin.py", line 1347, in call_function
    return handler(tx, args, kwargs)
  File "/packages/aps_ads_vm/launcher_multiapp-inplace#link-tree/torch/_dynamo/variables/builtin.py", line 967, in <lambda>
    tx, [v.realize() for v in args], kwargs
  File "/packages/aps_ads_vm/launcher_multiapp-inplace#link-tree/torch/_dynamo/variables/builtin.py", line 967, in <listcomp>
    tx, [v.realize() for v in args], kwargs
  File "/packages/aps_ads_vm/launcher_multiapp-inplace#link-tree/torch/_dynamo/variables/lazy.py", line 72, in realize
    self._cache.realize()
  File "/packages/aps_ads_vm/launcher_multiapp-inplace#link-tree/torch/_dynamo/variables/lazy.py", line 33, in realize
    self.vt = builder.VariableBuilder(tx, self.source)(self.value)
  File "/packages/aps_ads_vm/launcher_multiapp-inplace#link-tree/torch/_dynamo/variables/builder.py", line 445, in __call__
    vt = self._wrap(value)
  File "/packages/aps_ads_vm/launcher_multiapp-inplace#link-tree/torch/_dynamo/variables/builder.py", line 1043, in _wrap
    torch._dynamo.utils.store_user_object_weakref(value)
  File "/packages/aps_ads_vm/launcher_multiapp-inplace#link-tree/torch/_dynamo/utils.py", line 4694, in store_user_object_weakref
    user_obj_id_to_weakref[obj_id] = weakref.ref(obj)
torch._dynamo.exc.InternalTorchDynamoError: TypeError: cannot create weak reference to 'torch.Event' object
```

This pull request makes us gracefully graph break, vs explicitly crashing.

I've added a test which reproduces the issue. There is a side discussion re: how torch.Event support ever worked here, since it appears you cannot take a weakref to a torch.Event.
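
For illustration, a minimal sketch of catching the failing `weakref.ref` call and treating the object as non-weak-referenceable instead of crashing (the real fix graph-breaks inside dynamo; `NoWeakref` here is just a stand-in for objects like torch.Event):

```python
import weakref

class Plain:
    pass

class NoWeakref:
    __slots__ = ("value",)   # no __weakref__ slot, so weakref.ref() raises TypeError

def try_store_weakref(obj) -> bool:
    # Instead of letting the TypeError escape (and crash the trace), report
    # that the object cannot be weak-referenced and fall back gracefully.
    try:
        weakref.ref(obj)
        return True
    except TypeError:
        return False

print(try_store_weakref(Plain()))      # True
print(try_store_weakref(NoWeakref()))  # False, handled instead of crashing
```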

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163168
Approved by: https://github.com/Lucaskabela, https://github.com/jansel
2025-09-22 22:11:09 +00:00
60c2bdedcd Replace Literal[None] with None in typing (#163489)
This PR replaces Literal[None] with None in typing annotations.
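
A tiny illustration of the substitution:

```python
from typing import Literal, Optional

def before(flag: Literal[None] = None) -> Optional[int]: ...
def after(flag: None = None) -> Optional[int]: ...  # equivalent and simpler
```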

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163489
Approved by: https://github.com/Skylion007, https://github.com/mlazos
2025-09-22 22:10:08 +00:00
b756b580fb Improve fake tensor leakage detection in export by not relying on gc too much (#163516)
Previously we relied on gc to get a snapshot of fake tensors before and after export, to get the list of fake tensors created during export. This caused some flakiness in our test suite (https://github.com/pytorch/pytorch/issues/162232). It seems super hard to make gc deterministic, so we just instrument fake tensor creation, which seems a lot better. In addition, it is also quite a bit faster than the previous approach because we are no longer manually triggering the garbage collector.

Differential Revision: [D82966648](https://our.internmc.facebook.com/intern/diff/D82966648)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163516
Approved by: https://github.com/ezyang
2025-09-22 22:04:24 +00:00
e0cbab46ad [Inductor] avoid CUDA__equal when constant tensors are from different device (#163529)
Summary:
Otherwise, we may hit:
```
Exception: Expected all tensors to be on the same device, but got other is on cuda:0, different from other tensors on cpu (when checking argument in method wrapper_CUDA__equal)
```
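
A hedged sketch of the kind of device-safe comparison this implies (illustrative names, not the actual Inductor code):

```python
import torch

def constants_equal(a: torch.Tensor, b: torch.Tensor) -> bool:
    if a.device != b.device:
        # Compare on a common device to avoid the wrapper_CUDA__equal error above.
        a, b = a.cpu(), b.cpu()
    return a.shape == b.shape and a.dtype == b.dtype and torch.equal(a, b)

print(constants_equal(torch.ones(2), torch.ones(2)))  # True
```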

Test Plan: UTs

Reviewed By: yushangdi

Differential Revision: D82974062

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163529
Approved by: https://github.com/yushangdi, https://github.com/Skylion007
2025-09-22 22:04:11 +00:00
4fc271e559 [inductor] Don't require_dense for grid_sampler_2d_backward (#163415)
Fixes #163372

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163415
Approved by: https://github.com/Skylion007
ghstack dependencies: #163386, #163398, #163387, #163414
2025-09-22 21:53:01 +00:00
c8fd2b45e5 [inductor] Skip test_baddmm on XPU (#163414)
Fixes #161484
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163414
Approved by: https://github.com/Skylion007
ghstack dependencies: #163386, #163398, #163387
2025-09-22 21:53:01 +00:00
a1bd9248eb [inductor] Fallback on strided complex add (#163387)
Fixes #163243
Fixes #162561

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163387
Approved by: https://github.com/eellison
ghstack dependencies: #163386, #163398
2025-09-22 21:52:53 +00:00
36c2a1325c [inductor] Fix bug where viewed outputs get padded (#163398)
Fixes #163328

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163398
Approved by: https://github.com/eellison
ghstack dependencies: #163386
2025-09-22 21:52:45 +00:00
7ea8998c0b Better decomp for torch.eye (#163386)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163386
Approved by: https://github.com/eellison
2025-09-22 21:52:37 +00:00
2b036632ca Allow add_persistent_r_block to scale up rblock up to a limit (#162296)
<img width="654" height="392" alt="Screenshot 2025-09-18 at 4 22 53 PM" src="https://github.com/user-attachments/assets/975650ec-f769-43a6-bdf5-2885a8d40d3c" />

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162296
Approved by: https://github.com/eellison
2025-09-22 21:41:46 +00:00
0256f91558 [BUG] MaxUnpool2d/3d should check output dim before accessing its elements (#163507)
Fixes #163409
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163507
Approved by: https://github.com/malfet, https://github.com/Skylion007
2025-09-22 21:36:48 +00:00
da05aa7a9d [BE] Use output_t directly (#163518)
Rather than deref the safe tensor wrapped in `TensorArg`
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163518
Approved by: https://github.com/Skylion007
2025-09-22 21:33:42 +00:00
e558f7a222 [vllm hash update] update the pinned vllm hash (#163463)
This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml).
Update the pinned vllm hash.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163463
Approved by: https://github.com/pytorchbot

Co-authored-by: Huy Do <huydhn@gmail.com>
2025-09-22 21:24:56 +00:00
09cb34c1dc [RELAND] Always build USE_DISTRIBUTED (#160449) and Make distributed modules importable even when backend not built (#159889) (#162594)
Summary:
Original: D81957844 and D81957923

Also, https://github.com/pytorch/pytorch/pull/162142 is patched in as well

#buildall

Test Plan:
sandcastle and oss ci

Rollback Plan:

Reviewed By: H-Huang

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162594
Approved by: https://github.com/H-Huang, https://github.com/dcci
2025-09-22 21:12:18 +00:00
4027e97791 [BE] Delete skipIfMPSOnMacOS13 (#163515)
As PyTorch needs MacOS-14 or newer to use MPS
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163515
Approved by: https://github.com/Skylion007
2025-09-22 21:10:22 +00:00
8e62d01f7a Add dynamic shapes doc (#159428)
This PR adds new Dynamic Shapes documentation and expands on the existing one.
- Adds a new structure with Intro, Core Concepts, Troubleshooting

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159428
Approved by: https://github.com/bobrenjc93

Co-authored-by: bobrenjc93 <bobren@meta.com>
2025-09-22 21:01:27 +00:00
8abc2af9b9 [STABLE ABI] Add clone method to torch::stable::Tensor (#161896)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/161896
Approved by: https://github.com/janeyx99
2025-09-22 20:39:24 +00:00
02da4753f5 Triton template IMA reads on B200 (#163460)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163460
Approved by: https://github.com/eqy, https://github.com/alexsamardzic
2025-09-22 20:34:39 +00:00
cf28ab2c88 remove allow-untyped-defs from ./torch/ao/quantization/pt2e/duplicate_dq_pass.py (#163470)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163470
Approved by: https://github.com/aorenste
ghstack dependencies: #163246, #163469
2025-09-22 20:29:09 +00:00
46e1b7d70b remove allow-untyped-defs from ./torch/utils/data/datapipes/iter/fileopener.py (#163469)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163469
Approved by: https://github.com/aorenste, https://github.com/Skylion007
ghstack dependencies: #163246
2025-09-22 20:29:09 +00:00
e065d35fd3 [BE]: Add a few more missing move from return indices (#163456)
@ezyang A follow-up where I found a few more missing returns of this style in the codebase. Follow-up to #163416.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163456
Approved by: https://github.com/cyyever, https://github.com/albanD
2025-09-22 20:24:23 +00:00
fd785b1762 Add NestedTensor dispatch for _is_any_true/_is_all_true (#162096)
Fixes: https://github.com/pytorch/pytorch/issues/161818

### Summary
Add NestedTensor support for `_is_any_true` and `_is_all_true`.

### Changes
- Register dispatch for `aten._is_any_true.default` and
  `aten._is_all_true.default`
- Add CPU tests:
  - `test_is_any_true_jagged`: dispatch_matches_values_buffer,
    all_false_returns_false, one_true_returns_true
  - `test_is_all_true_jagged`: dispatch_matches_values_buffer,
    all_true_returns_true, any_false_returns_false

### Testing

Before Fix:

`pytest -q test/test_nestedtensor.py -k "test_is_any_true_jagged or test_is_all_true_jagged" -v`

Output:
```
FAILED [0.0129s] test/test_nestedtensor.py::TestNestedTensorDeviceTypeCPU::test_is_all_true_jagged_cpu - NotImplementedError: aten._is_all_true.default
FAILED [0.0007s] test/test_nestedtensor.py::TestNestedTensorDeviceTypeCPU::test_is_any_true_jagged_cpu - NotImplementedError: aten._is_any_true.default
```

After Fix:

`pytest -q test/test_nestedtensor.py -k "test_is_any_true_jagged or test_is_all_true_jagged" -v`

Output:

```
Running 2 items in this shard

test/test_nestedtensor.py::TestNestedTensorDeviceTypeCPU::test_is_all_true_jagged_cpu PASSED [0.0277s]                                                                                                                               [ 50%]
test/test_nestedtensor.py::TestNestedTensorDeviceTypeCPU::test_is_any_true_jagged_cpu PASSED [0.0013s]
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162096
Approved by: https://github.com/jbschlosser
2025-09-22 20:22:44 +00:00
d0086708dd [triton] update 3.5 pin to bbb06c0334a6772b92d24bde54956e675c8c6604 (#163382)
Includes:
* https://github.com/triton-lang/triton/pull/8211 to work around a PTXAS bug that was causing 03-matrix-multiplication tutorial matmuls to underperform due to excessive WGMMA waits
* https://github.com/triton-lang/triton/pull/8157 to fix a convert_layout bug

Verified that this passes Triton CI in https://github.com/pytorch/pytorch/pull/159158 and improves gemm perf (see https://github.com/pytorch/pytorch/issues/159704)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163382
Approved by: https://github.com/Camyll, https://github.com/atalman
2025-09-22 20:20:59 +00:00
6f9aef5fef [2/n] Support module.to("cuda:0") in FakeTensorMode on cuda-less machine (#163433)
Summary:
To support exporting a cuda model on a CPU-only machine under fake tensor mode.
Users commonly need to move sample inputs to the cuda device with a .to("cuda:0") or .to("cuda") call.
This diff supports that.

I expect the following pattern to work

```
with FakeTensorMode(allow_non_fake_inputs=True):
    cuda_module = module.to("cuda:0")
    cuda_sample_inputs = tuple([x.to("cuda:0") for x in sample_inputs])

    with torch.no_grad():
        ep = torch.export.export(cuda_module, cuda_sample_inputs)

```

Before:
Moving module.to("cuda:0") under fake tensor mode would leave parameters on the `meta` device.

After:
Parameters are on "cuda:0".

Test Plan: buck2 run  fbcode//caffe2/test:fake_tensor -- --r test_move_module

Reviewed By: mikaylagawarecki

Differential Revision: D80102876

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163433
Approved by: https://github.com/albanD
2025-09-22 20:16:32 +00:00
d15048493c [opaque_obj] Add set_payload + docs (#163276)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163276
Approved by: https://github.com/zou3519
ghstack dependencies: #162660
2025-09-22 20:02:29 +00:00
bf28990c3d Add support for NestedTensor share_memory_ (#162272)
Fixes: https://github.com/pytorch/pytorch/issues/161915

### Summary

Implements share_memory_() support for NestedTensor!

### Changes

- Added share_memory_() method to NestedTensor class.
  - Shares storage for all NestedTensor components: _values, _offsets, _lengths, and cached seqlen tensors.
  - Guard for CUDA Tensors.

### Testing

Before Fix:

`pytest -q test/test_nestedtensor.py -k "test_share_memory" -v`

Output:

```
Running 1 items in this shard

test/test_nestedtensor.py Fatal Python error: Segmentation fault
```

After Fix:

`pytest -q test/test_nestedtensor.py -k "test_share_memory" -v`

Output:

```
Running 1 items in this shard

test/test_nestedtensor.py::TestNestedTensorDeviceTypeCPU::test_share_memory_cpu PASSED [0.0753s]
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162272
Approved by: https://github.com/jbschlosser
2025-09-22 19:59:58 +00:00
eaa613bf66 Revert "[opaque_obj] Add set_payload + docs (#163276)"
This reverts commit dd30667f6c2204a15e91eaeb61c84f9080be7748.

Reverted https://github.com/pytorch/pytorch/pull/163276 on behalf of https://github.com/ZainRizvi due to Sorry but this fails lint on trunk: [GH job link](https://github.com/pytorch/pytorch/actions/runs/17924886989/job/50968430537) [HUD commit link](dd30667f6c) ([comment](https://github.com/pytorch/pytorch/pull/163276#issuecomment-3321054061))
2025-09-22 19:32:30 +00:00
1818c36d6e [Fix] Restrict stride normalization to 1D tensors on export (#163282)
This change restricts the DLPack stride normalization to apply only to 1D tensors of shape (1,).

### Rationale
The previous implementation normalized the strides for any multi-dimensional tensor containing a dimension of size 1. While well-intentioned, this "over-normalization" discards critical memory layout information, causing issues for downstream consumers who rely on strides to infer alignment and contiguity.

For example:

* A row-major tensor with `shape=(1, 128)` and `stride=(128, 1)` would be incorrectly normalized to `stride=(1, 1)`.

* A column-major tensor with `shape=(1024, 1)` and `stride=(1, 1024)` would also be normalized to `stride=(1, 1)`.

This loss of stride information makes it impossible for consumers to detect the original memory layout (e.g., row-major vs. column-major) and breaks assumptions about memory alignment needed for optimized indexing or specialized hardware APIs like GPU TMA.

The original intent of the normalization was to handle the simple case of a 1D tensor with shape=(1,) and a non-standard stride. This fix reverts to that specific, non-problematic behavior, ensuring that multi-dimensional tensors retain their precise stride information during DLPack export.
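
A minimal sketch of the restricted rule, assuming a standalone helper rather than the actual torch DLPack export code:

```python
def normalize_strides(shape: tuple[int, ...], strides: tuple[int, ...]) -> tuple[int, ...]:
    # Only a 1-D tensor of shape (1,) gets its stride rewritten; every other
    # tensor keeps its original strides so layout information is preserved.
    if len(shape) == 1 and shape[0] == 1:
        return (1,)
    return strides

print(normalize_strides((1,), (99,)))         # (1,)     -> normalized
print(normalize_strides((1, 128), (128, 1)))  # (128, 1) -> left untouched
```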

### Related Issues
#163274

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163282
Approved by: https://github.com/eqy
2025-09-22 19:10:05 +00:00
7e9781174c Fix lint (#163542)
Fixes #ISSUE_NUMBER

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163542
Approved by: https://github.com/malfet
2025-09-22 19:10:00 +00:00
4941719061 Enable logging for absolute memory estimation (#158799)
Summary: Update the Auto AC logging so that it also provides the *absolute* memory estimations for each node.

Test Plan:
(aps-gem_omnifm_v2_mwb_dynamic_005_budget-f23a84c3d8): https://fburl.com/ai_infra/0r738h5r

{F1980393481}

* Memory Recorded in bytes

---

```
buck2 test //caffe2/test/functorch:test_ac_logging
```
https://www.internalfb.com/intern/testinfra/testrun/14918173863021573

Rollback Plan:

Differential Revision: D78580107

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158799
Approved by: https://github.com/jansel
2025-09-22 18:36:49 +00:00
dd30667f6c [opaque_obj] Add set_payload + docs (#163276)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163276
Approved by: https://github.com/zou3519
ghstack dependencies: #162660
2025-09-22 18:30:28 +00:00
3be9c86c74 [opaque obj] Initial OpaqueObject (#162660)
A big pain point people have with custom ops is that they do not accept arbitrary inputs/outputs. In this PR we create the concept of an "OpaqueObject" which allows users to pass arbitrary python objects into custom operators.

Some parts of this implementation are still slightly annoying:
- The schema of the operator is `__torch__.torch.classes.aten.OpaqueObject` instead of the actual Python type
- `@torch.library.custom_op` doesn't work... yet?

UX:
```python
from torch._library.opaque_object import make_opaque, get_payload

# your custom python class
class OpaqueQueue:
    def __init__(self, queue: list[torch.Tensor], init_tensor_: torch.Tensor) -> None:
        super().__init__()
        self.queue = queue
        self.init_tensor_ = init_tensor_

    def push(self, tensor: torch.Tensor) -> None:
        self.queue.append(tensor)

    def pop(self) -> torch.Tensor:
        if len(self.queue) > 0:
            return self.queue.pop(0)
        return self.init_tensor_

    def size(self) -> int:
        return len(self.queue)

queue = OpaqueQueue([], torch.zeros(3))
obj: torch._C.ScriptObject = make_opaque(queue)

# obj.payload stores a direct reference to this python queue object
self.assertEqual(get_payload(obj), queue)

# This is able to be passed through the dispatcher
torch.ops._TestOpaqueObject.queue_push(obj, torch.ones(3))
self.assertEqual(queue.size(), 1)
```

Authoring a custom op:

```python
lib = torch.library.Library("_TestOpaqueObject", "FRAGMENT")

torch.library.define(
    f"_TestOpaqueObject::queue_push",
    "(__torch__.torch.classes.aten.OpaqueObject a, Tensor b) -> ()",
    tags=torch.Tag.pt2_compliant_tag,
    lib=lib,
)

@torch.library.impl("_TestOpaqueObject::queue_push", "CompositeExplicitAutograd", lib=lib)
def push_impl(q: torch._C.ScriptObject, b: torch.Tensor) -> None:
    # We can get the payload directly by get_payload(q)
    queue = get_payload(q)
    assert isinstance(queue, OpaqueQueue)
    queue.push(b)
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162660
Approved by: https://github.com/zou3519
2025-09-22 18:30:28 +00:00
bec967eaa4 Remove C++ and test branches for CUDA<12 (#163443)
Remove conditional branches for CUDA<12.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163443
Approved by: https://github.com/eqy
2025-09-22 18:20:08 +00:00
d279a6a6f1 ci: Add a way to lint all files in a PR from label (#163525)
Signed-off-by: Eli Uriegas <eliuriegas@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163525
Approved by: https://github.com/ZainRizvi
2025-09-22 18:06:39 +00:00
281f8f407e Combine strong and weak refcounts in intrusive_ptr in a single refcount (#163394)
Summary:
Currently, we assume that refcount_ and weakcount_ are always stored at an 8-byte-aligned address right next to each other. Based on this assumption, we load 8 bytes in intrusive_ptr::reset_ to check the values of both counts. However, that assumption is not part of the C++ language standard, so it's essentially undefined behavior.

This change eliminates that assumption by combining refcount_ and weakcount_ into a single 64-bit count, using the lower 32 bits for refcount_ and the upper 32 bits for weakcount_.

In addition to eliminating the undefined behavior, the change also eliminates the read of weakcount_ after decrementing refcount_ in intrusive_ptr::reset_. This claws back lost performance introduced in https://github.com/pytorch/pytorch/pull/162784 for non-final refcount_ decrementing.
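
For intuition, a small sketch of the packing scheme in Python (illustrative only; the real implementation uses C++ atomics inside intrusive_ptr):

```python
# Lower 32 bits hold refcount_, upper 32 bits hold weakcount_.
REF_ONE = 1
WEAK_ONE = 1 << 32

def refcount(combined: int) -> int:
    return combined & 0xFFFFFFFF

def weakcount(combined: int) -> int:
    return combined >> 32

combined = REF_ONE + WEAK_ONE       # one strong ref, one weak ref
prev = combined                     # a single atomic fetch_sub would return this value
combined -= REF_ONE                 # drop the strong ref
# Both counts were observed from one 64-bit value; no separate weakcount_ read is needed.
print(refcount(prev), weakcount(prev))          # 1 1
print(refcount(combined), weakcount(combined))  # 0 1
```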

Reviewed By: yfeldblum

Differential Revision: D82869192

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163394
Approved by: https://github.com/Skylion007
2025-09-22 17:53:28 +00:00
5e7be98800 [BE] Update Python min version to 3.10 (#162310)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162310
Approved by: https://github.com/atalman, https://github.com/Skylion007, https://github.com/ZainRizvi
2025-09-22 17:04:21 +00:00
06fe5b9025 [AOTI] fix TestAOTInductorPackage temp file locked handler. (#163499)
Fix the `check_model` function of the common class `TestAOTInductorPackage` in `test/inductor/test_aot_inductor_package.py`: the temp file kept a locked file handle on Windows, which caused the C++ backend to fail when opening the file:
```cmd
FAILED [4.5918s] test/inductor/test_aot_inductor_package.py::TestAOTInductorPackage_cpu::test_add - RuntimeError: File C:/Users/Xuhan/AppData/Local/Temp/tmp21sjnnhl.pt2 cannot be opened.
FAILED [4.1703s] test/inductor/test_aot_inductor_package.py::TestAOTInductorPackage_cpu::test_bool_input - RuntimeError: File C:/Users/Xuhan/AppData/Local/Temp/tmp5kd3apub.pt2 cannot be opened.
FAILED [4.2266s] test/inductor/test_aot_inductor_package.py::TestAOTInductorPackage_cpu::test_linear - RuntimeError: File C:/Users/Xuhan/AppData/Local/Temp/tmpkyy3pxow.pt2 cannot be opened.
FAILED [4.2134s] test/inductor/test_aot_inductor_package.py::TestAOTInductorPackage_cpu::test_metadata - RuntimeError: File C:/Users/Xuhan/AppData/Local/Temp/tmphyer7wi9.pt2 cannot be opened.
......
```

Fix it via `WritableTempFile`, which releases the file handle so the backend can open the file.
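
A sketch of the Windows-safe pattern using the standard `tempfile` module for illustration (the actual fix uses the repo's `WritableTempFile` helper, whose exact API is not shown here):

```python
import os
import tempfile

# On Windows, an open temp-file handle keeps the file locked, so the C++
# backend cannot open the same path. Close the handle first, hand the path
# to the backend, and clean up afterwards.
fd, path = tempfile.mkstemp(suffix=".pt2")
os.close(fd)  # release the lock before the backend opens the file
try:
    ...  # e.g. package/compile into `path`, then load it from the C++ side
finally:
    os.remove(path)
```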

After the fix:

<img width="1904" height="176" alt="image" src="https://github.com/user-attachments/assets/e71b3182-0204-497b-9aca-cbbb33bc4687" />

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163499
Approved by: https://github.com/jansel, https://github.com/desertfire
2025-09-22 16:54:18 +00:00
9ca183e933 switch from stack based to graph based approach (#163459)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163459
Approved by: https://github.com/bobrenjc93
ghstack dependencies: #163417
2025-09-22 16:41:35 +00:00
e310cc5e06 Update fbgemm submodule (#163411)
Test Plan:

As titled, includes some new changes in fbgemm to see if the CUDA 13 breakage is fixed.

Reviewers:

Subscribers:

Tasks:

Tags:

Fixes #ISSUE_NUMBER

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163411
Approved by: https://github.com/Skylion007
2025-09-22 15:46:11 +00:00
eaac218b64 [ROCm] Fix environment variable AOTRITON_INSTALLED_PREFIX (#163373)
Early assignment of `__AOTRITON_LIB` breaks the use of the environment variable `$AOTRITON_INSTALLED_PREFIX`.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163373
Approved by: https://github.com/pruthvistony, https://github.com/jeffdaily
2025-09-22 15:01:18 +00:00
509c4e8627 Update cutlass version for fbcode (#163091)
Differential Revision: [D82567751](https://our.internmc.facebook.com/intern/diff/D82567751/)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163091
Approved by: https://github.com/drisspg
2025-09-22 14:31:11 +00:00
10adeb9044 Revert "[BE] Update Python min version to 3.10 (#162310)"
This reverts commit 9f5a644f0768258bc81f8b38492754d297399f74.

Reverted https://github.com/pytorch/pytorch/pull/162310 on behalf of https://github.com/malfet due to Broke lint, but to the best of my knowledge it's no longer possible to run lint for all files on PRs ([comment](https://github.com/pytorch/pytorch/pull/162310#issuecomment-3319289031))
2025-09-22 14:13:59 +00:00
9f5a644f07 [BE] Update Python min version to 3.10 (#162310)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162310
Approved by: https://github.com/atalman, https://github.com/Skylion007, https://github.com/ZainRizvi
2025-09-22 13:37:02 +00:00
60b4791d08 [MPS] Fix compile linalg inv (#163452)
Fixes #161969

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163452
Approved by: https://github.com/Skylion007
2025-09-22 10:36:52 +00:00
96a3afb8ec Simplify BFLOAT16_AVAILABLE (#163445)
Simplify `BFLOAT16_AVAILABLE` by using `torch.cuda.is_bf16_supported()`  and `torch.xpu.is_bf16_supported()`. Outdated comments are also removed.
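
A sketch of the simplified check (the exact guards around availability are assumptions, not the literal diff):

```python
import torch

# bf16 support reported directly by the accelerator backends.
BFLOAT16_AVAILABLE = (
    (torch.cuda.is_available() and torch.cuda.is_bf16_supported())
    or (torch.xpu.is_available() and torch.xpu.is_bf16_supported())
)
print(BFLOAT16_AVAILABLE)
```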

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163445
Approved by: https://github.com/Skylion007, https://github.com/kwen2501
2025-09-22 07:31:46 +00:00
edafc902d7 Revert "[BE] Make PyObjectSlot use a global PyInterpreter (#162659)"
This reverts commit d1993c27ae59842c887d549a3f8936fbcd769498.

Reverted https://github.com/pytorch/pytorch/pull/162659 on behalf of https://github.com/wdvr due to reverted internally, please see D82771705 @PaliC ([comment](https://github.com/pytorch/pytorch/pull/162659#issuecomment-3317110247))
2025-09-22 06:22:37 +00:00
ae5be038a6 Revert "Delete functorch C extension entirely. (#163340)"
This reverts commit 1faf6367e396b1d0894e8735912a47ac465f469d.

Reverted https://github.com/pytorch/pytorch/pull/163340 on behalf of https://github.com/wdvr due to temporary revert to pull out #162659 ([comment](https://github.com/pytorch/pytorch/pull/163340#issuecomment-3317105243))
2025-09-22 06:20:04 +00:00
f0078941cf Revert "[RELAND] Always build USE_DISTRIBUTED (#160449) and Make distributed modules importable even when backend not built (#159889) (#162594)"
This reverts commit 6c334885d48725197b5d35e2c1543efc0f4198d0.

Reverted https://github.com/pytorch/pytorch/pull/162594 on behalf of https://github.com/wdvr due to reverted internally - @ezyang see D82281294 ([comment](https://github.com/pytorch/pytorch/pull/162594#issuecomment-3317017530))
2025-09-22 05:39:07 +00:00
3a7db34cf9 Revert "[SymmMem] Promote @requires_nvshmem instead of enable_triton (#163423)"
This reverts commit 5d8a226e23339e7243a2a84afd174f685f145b68.

Reverted https://github.com/pytorch/pytorch/pull/163423 on behalf of https://github.com/wdvr due to temporary reverting to back out #162594 ([comment](https://github.com/pytorch/pytorch/pull/163423#issuecomment-3317011500))
2025-09-22 05:35:41 +00:00
281bb56cc5 Enable half precision types on test_conv_cudnn_nhwc_support (#163444)
This PR adds float16 and bfloat16 cases to `test_conv_cudnn_nhwc_support` and removes outdated comments.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163444
Approved by: https://github.com/Skylion007
2025-09-22 04:11:20 +00:00
01f927eb40 Remove workarounds for Python 3.6 (#163440)
This PR removes tuple unpacking workarounds for Py 3.6 from two distributed files.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163440
Approved by: https://github.com/ezyang
2025-09-22 04:08:04 +00:00
0b59492853 [export] Fix wrap_with_set_grad_enabled retracing (#163295)
Fixes https://github.com/pytorch/pytorch/issues/163294

The code `with torch.set_grad_enabled(enable_grad)` calls `torch._C._set_grad_enabled` three times -- (1) when [initializing set_grad_enabled](bb7c9a2d41/torch/autograd/grad_mode.py (L187C9-L187C35)), (2) when [entering the context](bb7c9a2d41/torch/autograd/grad_mode.py (L194)), and (3) when [exiting the context](bb7c9a2d41/torch/autograd/grad_mode.py (L197)).

This results in the retraced export module having a duplicate `torch._C._set_grad_enabled`, like:
```
def forward(self, arg0_1):
    add = torch.ops.aten.add.Tensor(arg0_1, 1);  arg0_1 = None
    _set_grad_enabled = torch._C._set_grad_enabled(False);  _set_grad_enabled = None
    _set_grad_enabled = torch._C._set_grad_enabled(False);  _set_grad_enabled = None
    add_1 = torch.ops.aten.add.Tensor(add, 2);  add = None
    _set_grad_enabled_1 = torch._C._set_grad_enabled(True);  _set_grad_enabled_1 = None
    add_2 = torch.ops.aten.add.Tensor(add_1, 3);  add_1 = None
    return (add_2,)
```

When export runs the `replace_set_grad_with_hop_pass`, it looks through the graph for `torch._C._set_grad_enabled` and creates subgraphs. The duplicate `torch._C._set_grad_enabled` results in an empty submod in the graph, which caused the error in [this post](https://fb.workplace.com/groups/1028545332188949/posts/1844720036398281/?comment_id=1862175381319413).
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163295
Approved by: https://github.com/yushangdi
2025-09-21 22:54:40 +00:00
8a281d7214 [submodule] Bump libfmt to 12.0.0 (#163441)
libfmt 12.0 brings new optimisations and fixes some compilation issues for clang 21 (https://github.com/fmtlib/fmt/pull/4477).
For a detailed release log, see https://github.com/fmtlib/fmt/releases/tag/12.0.0
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163441
Approved by: https://github.com/Skylion007
2025-09-21 22:37:25 +00:00
6ac2b3ae35 [BE] Adding aliases for CUDA and XPU API documentation (#162984)
This PR reorganizes the CUDA and XPU API documentation with additional alias pages. Duplicate entries for APIs under torch.cuda are thus removed.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162984
Approved by: https://github.com/janeyx99
2025-09-21 22:28:27 +00:00
8b14f43da9 [torch] DRY a couple of lines in unpickler (#163447)
Test Plan: CI.

Reviewed By: dolpm

Differential Revision: D82660989

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163447
Approved by: https://github.com/Skylion007
2025-09-21 20:29:33 +00:00
4d3d32f14c Add torchfuzz initial impl. (#163417)
All details are in readme.md.
Note: one thing I want to do soon is to switch to a graph representation instead of a stack representation
for the fuzzed ops; it should make things easier as things get more complicated.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163417
Approved by: https://github.com/bobrenjc93
2025-09-21 19:17:54 +00:00
5599f487ef Fully native DTensor.__new__ (#162508)
Move the entirety of `__new__` into C++, saving a layer of disable_dynamo and making progress toward all-C++.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162508
Approved by: https://github.com/ezyang
ghstack dependencies: #161695
2025-09-21 18:36:05 +00:00
51152efa67 Remove autograd code for Python < 3.9 (#163313)
As PyTorch is moving to Python 3.10, it is safe to remove code for Python < 3.9.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163313
Approved by: https://github.com/ezyang
2025-09-21 15:35:06 +00:00
f34744d2a5 [inductor] bugfix: keep WeakDeps (WAR deps) during fusion (#162316)
Fixes #159855. This was not triggered in other tests since it took
more than one round of fusion to reach the problematic code,
which prunes WeakDeps. The WeakDeps are important to inhibit
fusion of kernels that read/write data into mutated buffers
with different indexing.

We modify the code to a) always prune before fusion, rather
than after, which improves its coverage and makes our basic
vertical fusion tests surface this issue as well and b)
check whether the weak dep is fusable before eliminating it
(which basically means checking that the producing code and
the consuming code are sufficiently compatible).

The test that triggers this with change (a) is
test_fusing_write_into_disjoint_read, introduced in #118210.
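
A schematic of the revised pruning rule in change (b), with hypothetical names (`weak_deps`, `producer`, `can_fuse`) standing in for the real scheduler structures:

```python
# Keep a WAR weak dep unless the producing and consuming code are compatible
# enough that fusing them would be safe anyway; only then may the dep be pruned.
def prune_weak_deps(node, can_fuse):
    node.weak_deps = [
        dep for dep in node.weak_deps
        if not can_fuse(dep.producer, node)  # incompatible -> keep the dep to inhibit fusion
    ]
```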

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162316
Approved by: https://github.com/eellison, https://github.com/mlazos, https://github.com/shunting314
2025-09-21 13:08:11 +00:00
5d8a226e23 [SymmMem] Promote @requires_nvshmem instead of enable_triton (#163423)
### Issue
The previous `enable_triton` UI required the user-defined Triton kernel to have "nvshmem" in its name.
If users did not do so, the kernel would miss the NVSHMEM init and silently hit a CUDA IMA.

The `@requires_nvshmem` decorator eliminates the above name requirement (and the `enable_triton` call).

### Usage:
```
@requires_nvshmem
@triton.jit
def foo(...):
    ...

foo[(1, 1)](...)
```
It also removes the need to pass `extern_lib` to `foo` (this is handled by the decorator now).

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163423
Approved by: https://github.com/ngimel
ghstack dependencies: #163025, #163152, #163194
2025-09-21 10:03:20 +00:00
d8cbbc0f70 [Easy][AMP] Refactor the AMP logic for getting dtype (#162796)
As the title states.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162796
Approved by: https://github.com/ezyang
2025-09-21 06:32:35 +00:00
9ba918082a Add api info for torch._C._nn.pyi (#162707)
Fix part of #148404

APIs involved are as follows:

- multilabel_margin_loss
- multi_margin_loss
- nll_loss_nd
- relu6
- relu6_

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162707
Approved by: https://github.com/ezyang
2025-09-21 06:17:15 +00:00
1faf6367e3 Delete functorch C extension entirely. (#163340)
Signed-off-by: Edward Yang <ezyang@meta.com>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163340
Approved by: https://github.com/aorenste
ghstack dependencies: #160236
2025-09-21 06:02:21 +00:00
4a96a6fa4a [Docs] Fix indentations in cond.md (#156147)
This is a follow-up PR to fix indentations mentioned by https://github.com/pytorch/pytorch/pull/155653#issuecomment-2971660356

Pull Request resolved: https://github.com/pytorch/pytorch/pull/156147
Approved by: https://github.com/svekars, https://github.com/cyyever
2025-09-21 05:50:50 +00:00
f591bb5056 Remove data_source argument from Sampler (#163134)
`data_source` was declared to be removed in PT 2.2 but never was.
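
A minimal sketch of a custom sampler after the removal; note that nothing is passed to the base class constructor:

```python
from torch.utils.data import Sampler

class EveryOtherSampler(Sampler[int]):
    def __init__(self, length: int) -> None:
        super().__init__()  # Sampler.__init__ no longer takes data_source
        self.length = length

    def __iter__(self):
        return iter(range(0, self.length, 2))

    def __len__(self) -> int:
        return (self.length + 1) // 2

print(list(EveryOtherSampler(5)))  # [0, 2, 4]
```
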
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163134
Approved by: https://github.com/ezyang
2025-09-21 05:44:41 +00:00
1ca9445229 [BE][Ez]: Prevent copies of std::vector in CUDA ForeachOps (#163416)
Avoid unnecessary copies of std::vector. This tensor list is copied throughout the foreach paths, and this code is on a hot path for torch optimizers. Automatic move elision will not happen on the return statement, since the returned value is a subelement of a vector and would otherwise be copied out before the std::vector is destroyed. This should remove quite a few list copies along this path.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/163416
Approved by: https://github.com/ezyang
2025-09-21 05:24:13 +00:00
5b386ee16e [vllm hash update] update the pinned vllm hash (#163392)
This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml).
Update the pinned vllm hash.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/163392
Approved by: https://github.com/pytorchbot
2025-09-21 04:34:14 +00:00
269 changed files with 8582 additions and 2195 deletions

View File

@ -241,7 +241,7 @@ def wait_for_connection(addr, port, timeout=15, attempt_cnt=5):
try:
with socket.create_connection((addr, port), timeout=timeout):
return
except (ConnectionRefusedError, socket.timeout): # noqa: PERF203
except (ConnectionRefusedError, TimeoutError): # noqa: PERF203
if i == attempt_cnt - 1:
raise
time.sleep(timeout)

View File

@ -262,13 +262,10 @@ case "$tag" in
TRITON_CPU=yes
;;
pytorch-linux-jammy-linter)
# TODO: Use 3.9 here because of this issue https://github.com/python/mypy/issues/13627.
# We will need to update mypy version eventually, but that's for another day. The task
# would be to upgrade mypy to 1.0.0 with Python 3.11
PYTHON_VERSION=3.9
PYTHON_VERSION=3.10
;;
pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-linter)
PYTHON_VERSION=3.9
pytorch-linux-jammy-cuda12.8-cudnn9-py3.10-linter)
PYTHON_VERSION=3.10
CUDA_VERSION=12.8.1
;;
pytorch-linux-jammy-aarch64-py3.10-gcc11)

View File

@ -1 +1 @@
5ae38bdb0dc066c5823e34dc9797afb9de42c866
bbb06c0334a6772b92d24bde54956e675c8c6604

View File

@ -1,7 +1,7 @@
sphinx==5.3.0
#Description: This is used to generate PyTorch docs
#Pinned versions: 5.3.0
-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@1657ad2fc1acdc98aa719eebecbb0128a7c13ce4#egg=pytorch_sphinx_theme2
-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@d53b0ffb9b1cda68260693ea98f3483823c88d8e#egg=pytorch_sphinx_theme2
# TODO: sphinxcontrib.katex 0.9.0 adds a local KaTeX server to speed up pre-rendering
# but it doesn't seem to work and hangs around idly. The initial thought that it is probably

View File

@ -72,7 +72,7 @@ def sample_vllm_test_library():
]
),
"pytest -v -s entrypoints/llm/test_generate.py",
"VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode",
"pytest -v -s entrypoints/offline_mode",
],
},
"vllm_regression_test": {

View File

@ -334,11 +334,17 @@ test_python() {
}
test_python_smoke() {
# Smoke tests for H100
# Smoke tests for H100/B200
time python test/run_test.py --include test_matmul_cuda inductor/test_fp8 inductor/test_max_autotune $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
assert_git_not_dirty
}
test_python_smoke_b200() {
# Targeted smoke tests for B200 - staged approach to avoid too many failures
time python test/run_test.py --include test_matmul_cuda inductor/test_fp8 $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
assert_git_not_dirty
}
test_h100_distributed() {
# Distributed tests at H100
time python test/run_test.py --include distributed/_composable/test_composability/test_pp_composability.py $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
@ -1773,6 +1779,8 @@ elif [[ "${BUILD_ENVIRONMENT}" == *xpu* ]]; then
test_xpu_bin
elif [[ "${TEST_CONFIG}" == smoke ]]; then
test_python_smoke
elif [[ "${TEST_CONFIG}" == smoke_b200 ]]; then
test_python_smoke_b200
elif [[ "${TEST_CONFIG}" == h100_distributed ]]; then
test_h100_distributed
elif [[ "${TEST_CONFIG}" == "h100-symm-mem" ]]; then

View File

@ -1 +1 @@
9d1c50a5ac8726f4af0d4a4e85ad4d26a674ad26
090197034faf3b193c4467cedeb9281e3078892d

View File

@ -36,6 +36,7 @@ ciflow_push_tags:
- ciflow/win-arm64
- ciflow/h100-symm-mem
- ciflow/h100-cutlass-backend
- ciflow/b200
retryable_workflows:
- pull
- trunk

View File

@ -2,6 +2,12 @@ name: Get Changed Files
on:
workflow_call:
inputs:
all_files:
description: "Whether to return all files instead of just changed files"
required: false
type: boolean
default: false
outputs:
changed-files:
description: "List of changed files (space-separated) or '*' if not in a PR"
@ -26,17 +32,23 @@ jobs:
# Get the PR number from the github context
PR_NUMBER="${{ github.event.number }}"
# Use gh CLI to get changed files in the PR with explicit repo
CHANGED_FILES=$(gh api repos/${{ github.repository }}/pulls/$PR_NUMBER/files --paginate --jq '.[] | select(.status != "removed") | .filename' | tr '\n' ' ' | sed 's/ $//')
# Check if all_files is requested
if [ "${{ inputs.all_files }}" = "true" ]; then
echo "all_files input is true, returning all files"
echo "changed-files=*" >> "$GITHUB_OUTPUT"
else
# Use gh CLI to get changed files in the PR with explicit repo
CHANGED_FILES=$(gh api repos/${{ github.repository }}/pulls/$PR_NUMBER/files --paginate --jq '.[] | select(.status != "removed") | .filename' | tr '\n' ' ' | sed 's/ $//')
if [ -z "$CHANGED_FILES" ]; then
echo "No changed files found, setting to '*'"
CHANGED_FILES="*"
if [ -z "$CHANGED_FILES" ]; then
echo "No changed files found, setting to '*'"
CHANGED_FILES="*"
fi
echo "Changed files: $CHANGED_FILES"
echo "changed-files=$CHANGED_FILES" >> "$GITHUB_OUTPUT"
fi
echo "Changed files: $CHANGED_FILES"
echo "changed-files=$CHANGED_FILES" >> "$GITHUB_OUTPUT"
else
echo "Not in PR context, setting changed files to '*'"
echo "changed-files=*" >> "$GITHUB_OUTPUT"

View File

@ -70,7 +70,7 @@ jobs:
pytorch-linux-jammy-py3-clang18-asan,
pytorch-linux-jammy-py3-clang12-onnx,
pytorch-linux-jammy-linter,
pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-linter,
pytorch-linux-jammy-cuda12.8-cudnn9-py3.10-linter,
pytorch-linux-jammy-py3-clang12-executorch,
pytorch-linux-jammy-py3.12-triton-cpu,
pytorch-linux-noble-riscv64-py3.12-gcc14

View File

@ -31,6 +31,8 @@ jobs:
if: github.repository_owner == 'pytorch'
name: Get changed files
uses: ./.github/workflows/_get-changed-files.yml
with:
all_files: ${{ contains(github.event.pull_request.labels.*.name, 'lint-all-files') || contains(github.event.pull_request.labels.*.name, 'Reverted') }}
lintrunner-clang:
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
@ -53,7 +55,7 @@ jobs:
with:
timeout: 120
runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge"
docker-image: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-linter
docker-image: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.10-linter
# NB: A shallow checkout won't work here because calculate-docker-image requires a full checkout
# to run git rev-parse HEAD~:.ci/docker when a new image is needed
fetch-depth: 0
@ -264,10 +266,10 @@ jobs:
with:
submodules: false
fetch-depth: 1
- name: Setup Python 3.9
- name: Setup Python 3.10
uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
with:
python-version: '3.9'
python-version: '3.10'
architecture: x64
cache: pip
- name: Install dependencies

View File

@ -127,8 +127,6 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
# More memory is needed to build with asan
runner: linux.2xlarge.memory
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.10-clang18-asan
docker-image-name: ci-image:pytorch-linux-jammy-py3-clang18-asan

View File

@ -140,8 +140,6 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
# More memory is needed to build with asan
runner: linux.2xlarge.memory
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.10-clang18-asan
docker-image-name: ci-image:pytorch-linux-jammy-py3-clang18-asan

76
.github/workflows/test-b200.yml vendored Normal file
View File

@ -0,0 +1,76 @@
# B200 Smoke Tests CI Workflow
#
# This workflow runs smoke tests on B200 hardware
#
# Flow:
# 1. Builds PyTorch with CUDA 12.8+ and sm100 architecture for B200
# 2. Runs smoke tests on linux.dgx.b200 runner
# 3. Tests executed are defined in .ci/pytorch/test.sh -> test_python_smoke() function
#
# Triggered by:
# - Pull requests modifying this workflow file
# - Manual dispatch
# - Schedule (every 6 hours)
# - Adding ciflow/b200 label to a PR (creates ciflow/b200/* tag)
name: B200 Smoke Tests
on:
pull_request:
paths:
- .github/workflows/test-b200.yml
workflow_dispatch:
schedule:
- cron: 0 4,10,16,22 * * * # every 6 hours
push:
tags:
- ciflow/b200/*
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
cancel-in-progress: true
permissions:
id-token: write
contents: read
jobs:
get-label-type:
if: github.repository_owner == 'pytorch'
name: get-label-type
uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
linux-jammy-cuda12_8-py3_10-gcc11-sm100-build:
name: linux-jammy-cuda12.8-py3.10-gcc11-sm100
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runner: linux.12xlarge.memory
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
cuda-arch-list: '10.0'
test-matrix: |
{ include: [
{ config: "smoke_b200", shard: 1, num_shards: 1, runner: "linux.dgx.b200" },
]}
# config: "smoke_b200" maps to test_python_smoke_b200() in .ci/pytorch/test.sh
secrets: inherit
linux-jammy-cuda12_8-py3_10-gcc11-sm100-test:
name: linux-jammy-cuda12.8-py3.10-gcc11-sm100
uses: ./.github/workflows/_linux-test.yml
needs:
- linux-jammy-cuda12_8-py3_10-gcc11-sm100-build
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm100-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm100-build.outputs.test-matrix }}
aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
secrets: inherit

View File

@ -53,27 +53,3 @@ jobs:
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
linux-jammy-py3_9-clang9-xla-build:
name: linux-jammy-py3_9-clang9-xla
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.9-clang9-xla
docker-image-name: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/xla_base:v1.3-lite
test-matrix: |
{ include: [
{ config: "xla", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.12xlarge" },
]}
secrets: inherit
linux-jammy-py3_9-clang9-xla-test:
name: linux-jammy-py3_9-clang9-xla
uses: ./.github/workflows/_linux-test.yml
needs: linux-jammy-py3_9-clang9-xla-build
with:
build-environment: linux-jammy-py3.9-clang9-xla
docker-image: ${{ needs.linux-jammy-py3_9-clang9-xla-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-py3_9-clang9-xla-build.outputs.test-matrix }}
secrets: inherit

View File

@ -196,6 +196,7 @@ exclude_patterns = [
'tools/test/gen_operators_yaml_test.py',
'tools/test/gen_oplist_test.py',
'tools/test/test_selective_build.py',
'tools/experimental/dynamic_shapes/torchfuzz/**',
]
command = [
'python3',

View File

@ -1,5 +1,4 @@
cmake_minimum_required(VERSION 3.27 FATAL_ERROR)
# cmake_policy(SET CMP0022 NEW) cmake_policy(SET CMP0023 NEW)
# Use compiler ID "AppleClang" instead of "Clang" for XCode. Not setting this
# sometimes makes XCode C compiler gets detected as "Clang", even when the C++
@ -1486,4 +1485,4 @@ else()
To do so please export USE_PRIORITIZED_TEXT_FOR_LD=1
]])
endif()
endif()
endif()

View File

@ -317,10 +317,20 @@ IF(USE_FBGEMM_GENAI)
-greedy-reverse-local-assignment=1
-fhip-new-launch-api)
# Only compile for gfx942 for now.
# This is rather hacky, I could not figure out a clean solution :(
set(HIP_CLANG_FLAGS_ORIGINAL ${HIP_CLANG_FLAGS})
string(REGEX REPLACE "--offload-arch=[^ ]*" "" FILTERED_HIP_CLANG_FLAGS "${HIP_CLANG_FLAGS}")
if("gfx942" IN_LIST PYTORCH_ROCM_ARCH)
list(APPEND FILTERED_HIP_CLANG_FLAGS --offload-arch=gfx942;)
endif()
set(HIP_CLANG_FLAGS ${FILTERED_HIP_CLANG_FLAGS})
hip_add_library(
fbgemm_genai STATIC
${fbgemm_genai_native_rocm_hip}
HIPCC_OPTIONS ${HIP_HCC_FLAGS} ${FBGEMM_GENAI_EXTRA_HIPCC_FLAGS})
set(HIP_CLANG_FLAGS ${HIP_CLANG_FLAGS_ORIGINAL})
set_target_properties(fbgemm_genai PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_compile_definitions(fbgemm_genai PRIVATE FBGEMM_GENAI_NO_EXTENDED_SHAPES)

View File

@ -401,30 +401,13 @@ T* toDLPackImpl(const Tensor& src) {
// The following code detects whether the src follows
// a continuous pattern. If the src follows such pattern (common-case)
// then we do not need to normalize the strides.
bool need_normalize_strides = false;
int64_t expected_stride = 1;
for (int i = src.dim() - 1; i >= 0; i--) {
// detect if we do not meet continuous pattern
// and the size is 1, so there is opportunity to normalize
if (src.stride(i) != expected_stride && src.size(i) == 1) {
need_normalize_strides = true;
break;
}
expected_stride *= src.size(i);
}
bool need_normalize_strides = src.dim() == 1 && src.size(0) == 1 && src.stride(0) != 1;
// less common case, try normalizing the strides
if (need_normalize_strides) {
// create a new tensor with possibly normalized strides
// gh-83069
auto shape = src.sizes();
auto strides = src.strides().vec();
for (int i = 0; i < src.dim(); i++) {
if (shape[i] < 2) {
strides[i] = 1;
}
}
view = src.as_strided(shape, strides, src.storage_offset());
view = src.as_strided(shape, {1}, src.storage_offset());
}
ATenDLMTensor<T>* atDLMTensor(new ATenDLMTensor<T>);

View File

@ -94,10 +94,10 @@ inline at::DimVector infer_size_dv(IntArrayRef shape, int64_t numel) {
inline at::SymDimVector infer_size_dv(
c10::SymIntArrayRef shape,
const c10::SymInt& numel) {
c10::SymInt numel) {
auto res = at::SymDimVector(shape);
infer_size_impl<c10::SymIntArrayRef, c10::SymInt, at::SymDimVector>(
shape, numel, res);
shape, std::move(numel), res);
return res;
}

View File

@ -6,6 +6,7 @@
#include <c10/util/TypeList.h>
#include <c10/util/intrusive_ptr.h>
#include <c10/util/order_preserving_flat_hash_map.h>
#include <optional>
#include <ATen/core/TensorBody.h>
#include <ATen/core/jit_type_base.h>

View File

@ -55,7 +55,8 @@ class TORCH_API CppSignature final {
}
private:
explicit CppSignature(std::type_index signature) : signature_(signature) {}
explicit CppSignature(std::type_index signature)
: signature_(std::move(signature)) {}
std::type_index signature_;
};

View File

@ -70,7 +70,7 @@ private:
void _print_dispatch_trace(const std::string& label, const std::string& op_name, const DispatchKeySet& dispatchKeySet) {
auto nesting_value = dispatch_trace_nesting_value();
for (int64_t i = 0; i < nesting_value; ++i) std::cerr << " ";
std::cerr << label << " op=[" << op_name << "], key=[" << toString(dispatchKeySet.highestPriorityTypeId()) << "]" << '\n';
std::cerr << label << " op=[" << op_name << "], key=[" << toString(dispatchKeySet.highestPriorityTypeId()) << "]" << std::endl;
}
} // namespace detail
@ -213,11 +213,9 @@ OperatorHandle Dispatcher::findOrRegisterName_(const OperatorName& op_name) {
// Windows build doesn't produce the destructor symbol in PyTorch libs
// causing a linker failure in downstream projects.
// x-ref https://github.com/pytorch/pytorch/issues/70032
#if defined(_WIN32)
OperatorHandle::~OperatorHandle() = default;
#endif
RegistrationHandleRAII Dispatcher::registerLibrary(const std::string& ns, std::string debug) {
RegistrationHandleRAII Dispatcher::registerLibrary(std::string ns, std::string debug) {
std::lock_guard<std::mutex> lock(guard_->mutex);
auto found = libraries_.find(ns);
TORCH_CHECK(
@ -308,7 +306,7 @@ PythonModuleMapType& pythonModulesSingleton() {
}
std::optional<std::pair<const char*, const char*>> Dispatcher::getPyStub(const OperatorName& op_name) {
std::optional<std::pair<const char*, const char*>> Dispatcher::getPyStub(OperatorName op_name) {
std::lock_guard<std::mutex> lock(guard_->mutex);
auto found = pythonModulesSingleton().find(op_name);
if (found == pythonModulesSingleton().end()) {
@ -344,7 +342,7 @@ RegistrationHandleRAII Dispatcher::registerPythonModule(
});
}
void Dispatcher::throwIfHasPythonModule(const OperatorName& op_name) {
void Dispatcher::throwIfHasPythonModule(OperatorName op_name) {
std::lock_guard<std::mutex> lock(guard_->mutex);
auto elt = pythonModulesSingleton().find(op_name);
if (elt == pythonModulesSingleton().end()) {
@ -364,7 +362,7 @@ void Dispatcher::throwIfHasPythonModule(const OperatorName& op_name) {
}
RegistrationHandleRAII Dispatcher::registerImpl(
const OperatorName& op_name,
OperatorName op_name,
std::optional<DispatchKey> dispatch_key,
KernelFunction kernel,
std::optional<impl::CppSignature> cpp_signature,
@ -379,7 +377,7 @@ RegistrationHandleRAII Dispatcher::registerImpl(
*this,
dispatch_key,
std::move(kernel),
cpp_signature,
std::move(cpp_signature),
std::move(inferred_function_schema),
std::move(debug)
);
@ -408,7 +406,7 @@ void Dispatcher::deregisterImpl_(const OperatorHandle& op, const OperatorName& o
cleanup(op, op_name);
}
RegistrationHandleRAII Dispatcher::registerName(const OperatorName& op_name) {
RegistrationHandleRAII Dispatcher::registerName(OperatorName op_name) {
std::lock_guard<std::mutex> lock(guard_->mutex);
auto op = findOrRegisterName_(op_name);
++op.operatorDef_->def_and_impl_count;

View File

@ -13,10 +13,15 @@
#include <condition_variable>
#include <list>
#include <mutex>
#include <type_traits>
#include <ATen/core/enum_tag.h>
#include <ATen/core/grad_mode.h>
#ifndef NDEBUG
#include <iostream>
#endif
namespace c10 {
TORCH_API bool show_dispatch_trace();
@ -250,7 +255,7 @@ class TORCH_API Dispatcher final {
// NB: steals the inferred function schema, as we may need to hold on to
// it for a bit until the real schema turns up
RegistrationHandleRAII registerImpl(
const OperatorName& op_name,
OperatorName op_name,
std::optional<DispatchKey> dispatch_key,
KernelFunction kernel,
std::optional<impl::CppSignature> cpp_signature,
@ -269,15 +274,15 @@ class TORCH_API Dispatcher final {
/**
* Given an operator, throws if we have a pystub.
*/
void throwIfHasPythonModule(const OperatorName& op_name);
void throwIfHasPythonModule(OperatorName op_name);
std::optional<std::pair<const char*, const char*>> getPyStub(
const OperatorName& op_name);
OperatorName op_name);
/**
* Register a new operator by name.
*/
RegistrationHandleRAII registerName(const OperatorName& op_name);
RegistrationHandleRAII registerName(OperatorName op_name);
/**
* Register a fallback kernel for a backend.
@ -295,9 +300,7 @@ class TORCH_API Dispatcher final {
* API. These invocations are only permitted once per program, so we raise
* an error if this is called again for the same namespace.
*/
RegistrationHandleRAII registerLibrary(
const std::string& ns,
std::string debug);
RegistrationHandleRAII registerLibrary(std::string ns, std::string debug);
// ------------------------------------------------------------------------
//
@ -445,12 +448,8 @@ class TORCH_API OperatorHandle {
OperatorHandle& operator=(OperatorHandle&&) noexcept = default;
OperatorHandle(const OperatorHandle&) = default;
OperatorHandle& operator=(const OperatorHandle&) = default;
#if defined(_WIN32)
// NOLINTNEXTLINE(performance-trivially-destructible)
~OperatorHandle();
#else
~OperatorHandle() = default;
#endif
const OperatorName& operator_name() const {
return operatorDef_->op.operator_name();

View File

@ -556,7 +556,7 @@ inline std::ostream& operator<<(std::ostream& out, const Argument& arg) {
// real_type versus fake_type: in order to be compatible with FunctionSchema
// parser, printing an argument with either MemoryFormat or Layout type should
// give us the original schema string, hence printing out real_type.
const auto& type = arg.real_type();
auto type = arg.real_type();
bool is_opt = type->kind() == OptionalType::Kind;
auto unopt_type = is_opt ? type->castRaw<OptionalType>()->getElementType() : type;

View File

@ -232,7 +232,7 @@ struct TORCH_API OptionalType : public UnionType {
static TypePtr ofTensor();
//
// global singleton
static TypePtr get(const TypePtr& inner);
static TypePtr get(TypePtr inner);
private:
explicit OptionalType(const TypePtr& contained);
@ -895,7 +895,7 @@ struct TORCH_API ListType
// the type List<T>.
// The extra "identifier" argument is needed beccause we have multiple container types
// that all re-use this function (List<T>, array<T, N>, etc.)
static TypePtr get(const std::string& identifier, const TypePtr& inner);
static TypePtr get(const std::string& identifier, TypePtr inner);
// common cast List[Tensor]
static ListTypePtr ofTensors();

View File

@ -274,7 +274,7 @@ ListTypePtr ListType::ofNumbers() {
return value;
}
TypePtr OptionalType::get(const TypePtr& inner) {
TypePtr OptionalType::get(TypePtr inner) {
static ska::flat_hash_map<TypePtr, TypePtr> containerTypePtrs;
static std::mutex mutex;
// Perf from the lock is ok because this function is guarded behind
@ -287,7 +287,7 @@ TypePtr OptionalType::get(const TypePtr& inner) {
return containerTypePtrs[inner];
}
TypePtr ListType::get(const std::string& identifier, const TypePtr& inner) {
TypePtr ListType::get(const std::string& identifier, TypePtr inner) {
static ska::flat_hash_map<std::tuple<std::string, TypePtr>, TypePtr> containerTypePtrs;
static std::mutex mutex;
// Perf from the lock is ok because this function is guarded behind

View File

@ -1637,9 +1637,7 @@ bool gemm_and_bias(
if (activation == GEMMAndBiasActivationEpilogue::RELU) {
epilogue = CUBLASLT_EPILOGUE_RELU_BIAS;
} else if (activation == GEMMAndBiasActivationEpilogue::GELU) {
#if CUDA_VERSION >= 11040 || defined(USE_ROCM)
epilogue = CUBLASLT_EPILOGUE_GELU_BIAS;
#endif
}
if (bias != nullptr) {
@ -1931,7 +1929,6 @@ void scaled_gemm(
bool use_fast_accum) {
// Note: see `cublasCommonArgs` for various non-intuitive manupulations
// of input arguments to this function.
#if CUDA_VERSION >= 11080 || defined(USE_ROCM)
const auto computeType = CUBLAS_COMPUTE_32F;
const auto scaleType = CUDA_R_32F;
const float alpha_val = 1.0;
@ -2133,8 +2130,6 @@ void scaled_gemm(
" scaleType ",
scaleType);
return;
#endif // if CUDA_VERSION >= 11080 || defined(USE_ROCM)
TORCH_CHECK(false, "scaled_gemm is only supported for CUDA 11.8 and above");
}
void int8_gemm(

View File

@ -122,7 +122,7 @@ struct DeviceThreadHandlePool : public std::enable_shared_from_this<DeviceThread
// Called by the destructor. Releases this thread's handles back into the pool.
void release() {
if(!my_handles.empty()) {
if(my_handles.size() > 0) {
auto parent = weak_parent.lock();
if (!parent) {
// If this thread exits after atexit handlers have completed, the

View File

@ -139,7 +139,7 @@ static void autogradBasedTransformSendToNext(
std::bitset<default_bitset_size> outputs_aliasing_immutable; // set = 1 for all bits
if(!grad_special_case) {
for (auto idx = stack->size() - args_size; idx < stack->size(); idx++) {
const auto& ivalue = (*stack)[idx];
const auto ivalue = (*stack)[idx];
if (!ivalue.isTensor()) {
continue; // only input that can be aliased is a tensor, not a tensor list (expect in ops without returns)
}

View File

@ -6,8 +6,6 @@
#include <ATen/functorch/BatchRulesHelper.h>
#include <algorithm>
namespace at::functorch {
typedef std::tuple<Tensor, std::optional<int64_t>> oneOutput;
@ -317,7 +315,7 @@ oneOutput linalg_lu_solve_batch_rule(
const auto LU_num_batch_dims = rankWithoutBatchDim(LU_, LU_bdim) - LU_min_rank;
const auto pivots_num_batch_dims = rankWithoutBatchDim(pivots_, pivots_bdim) - pivots_min_rank;
const auto B_num_batch_dims = rankWithoutBatchDim(B_, B_bdim) - B_min_rank;
const auto max_num_batch_dims = std::max({LU_num_batch_dims, pivots_num_batch_dims, B_num_batch_dims});
const auto max_num_batch_dims = std::max(std::max(LU_num_batch_dims, pivots_num_batch_dims), B_num_batch_dims);
LU_ = maybePadToLogicalRank(LU_, LU_bdim, max_num_batch_dims + LU_min_rank);
pivots_ = maybePadToLogicalRank(pivots_, pivots_bdim, max_num_batch_dims + pivots_min_rank);

View File

@ -897,11 +897,11 @@ Tensor& div_(Tensor& self, const Scalar& other) {
}
Tensor div(const Tensor& self, const Scalar& other, std::optional<std::string_view> rounding_mode) {
return self.div(wrapped_scalar_tensor(other), rounding_mode); // redispatch!
return self.div(wrapped_scalar_tensor(other), std::move(rounding_mode)); // redispatch!
}
Tensor& div_(Tensor& self, const Scalar& other, std::optional<std::string_view> rounding_mode) {
return self.div_(wrapped_scalar_tensor(other), rounding_mode); // redispatch!
return self.div_(wrapped_scalar_tensor(other), std::move(rounding_mode)); // redispatch!
}
// divide, alias for div
@ -926,23 +926,23 @@ Tensor& divide_(Tensor& self, const Scalar& other) {
}
Tensor& divide_out(const Tensor& self, const Tensor& other, std::optional<std::string_view> rounding_mode, Tensor& result) {
return at::div_out(result, self, other, rounding_mode);
return at::div_out(result, self, other, std::move(rounding_mode));
}
Tensor divide(const Tensor& self, const Tensor& other, std::optional<std::string_view> rounding_mode) {
return self.div(other, rounding_mode);
return self.div(other, std::move(rounding_mode));
}
Tensor& divide_(Tensor& self, const Tensor& other, std::optional<std::string_view> rounding_mode) {
return self.div_(other, rounding_mode);
return self.div_(other, std::move(rounding_mode));
}
Tensor divide(const Tensor& self, const Scalar& other, std::optional<std::string_view> rounding_mode) {
return self.div(other, rounding_mode);
return self.div(other, std::move(rounding_mode));
}
Tensor& divide_(Tensor& self, const Scalar& other, std::optional<std::string_view> rounding_mode) {
return self.div_(other, rounding_mode);
return self.div_(other, std::move(rounding_mode));
}
// true_divide, an alias for div

View File

@ -150,7 +150,7 @@ void histogramdd_prepare_out(const Tensor& input, const std::vector<int64_t>& bi
void histogramdd_prepare_out(const Tensor& input, TensorList bins,
const Tensor& hist, const TensorList& bin_edges) {
std::vector<int64_t> bin_ct(bins.size());
std::transform(bins.begin(), bins.end(), bin_ct.begin(), [](const Tensor& t) { return t.numel() - 1; });
std::transform(bins.begin(), bins.end(), bin_ct.begin(), [](Tensor t) { return t.numel() - 1; });
histogramdd_prepare_out(input, bin_ct, hist, bin_edges);
}

View File

@ -360,7 +360,7 @@ Tensor einsum(std::string_view equation, TensorList operands, at::OptionalIntArr
// to compute the number of dimensions covered by ellipsis.
for(const auto i : c10::irange(num_ops)) {
const auto& operand = operands[i];
const auto& labels = op_labels[i];
const auto labels = op_labels[i];
const auto ndims = operand.dim();
int64_t nlabels = static_cast<int64_t>(labels.size());
bool has_ellipsis = false;

View File

@ -237,7 +237,7 @@ TORCH_META_FUNC(linalg_vector_norm)(const Tensor& self, const Scalar& scalar_ord
at::detail::check_linalg_norm_dtype(opt_dtype, self.scalar_type(), "linalg.vector_norm");
auto mask = at::native::make_dim_mask(dim, self.dim());
auto shape = at::native::shape_from_dim_mask(self, mask, keepdim);
auto shape = at::native::shape_from_dim_mask(self, std::move(mask), keepdim);
auto options = self.options()
.dtype(toRealValueType(opt_dtype.value_or(self.scalar_type())));
@ -641,7 +641,7 @@ namespace {
Tensor linalg_matrix_power_impl(
const Tensor& self,
int64_t n,
const std::optional<Tensor>& _out) {
std::optional<Tensor> _out) {
NoTF32Guard disable_tf32;
auto out = _out.value_or(Tensor());
@ -1019,7 +1019,7 @@ Tensor multi_dot_impl(TensorList _tensors, std::optional<Tensor> _out) {
Tensor result;
if (_out.has_value()) {
const auto& out = *_out;
auto out = *_out;
TORCH_CHECK(
dtype == out.dtype(),
"multi_dot(): expected out tensor to have dtype ",

View File

@ -493,7 +493,7 @@ Tensor get_clamped_target_length(
// the gradient is implemented for _cudnn_ctc_loss (just in derivatives.yaml) and _ctc_loss and this function has automatic gradients
// it also handles the reduction if desired
template <typename LengthsType>
Tensor ctc_loss_impl(const Tensor& log_probs_, const Tensor& targets, const LengthsType& input_lengths, const LengthsType& target_lengths, int64_t BLANK, int64_t reduction, bool zero_infinity) {
Tensor ctc_loss_impl(const Tensor& log_probs_, const Tensor& targets, LengthsType input_lengths, LengthsType target_lengths, int64_t BLANK, int64_t reduction, bool zero_infinity) {
auto is_batched = log_probs_.dim() == 3;
Tensor log_probs = is_batched ? log_probs_ : log_probs_.unsqueeze(1);
bool use_cudnn =

View File

@ -23,8 +23,6 @@ Tensor& max_unpooling2d_forward_out_cpu(
// Nondeterministic with duplicate indices
at::globalContext().alertNotDeterministic("max_unpooling2d_forward_out");
auto oheight = output_size[0];
auto owidth = output_size[1];
TORCH_CHECK(
indices_.scalar_type() == at::ScalarType::Long,
"elements in indices should be type int64 but got: ", indices_.scalar_type());
@ -45,6 +43,9 @@ Tensor& max_unpooling2d_forward_out_cpu(
self_.sizes(), " with dimension ", i , " being empty.");
}
auto oheight = output_size[0];
auto owidth = output_size[1];
auto memory_format = self_.suggest_memory_format();
auto self = self_.contiguous(memory_format);
auto indices = indices_.contiguous(memory_format);

View File

@ -599,7 +599,7 @@ std::tuple<Tensor, Tensor, Tensor, Tensor, int64_t> _batch_norm_impl_index(
check_dims_match_num_input_features("weight", num_features, weight.sym_numel());
}
if (bias.defined()) {
check_dims_match_num_input_features("bias", num_features, bias.sym_numel());
check_dims_match_num_input_features("bias", std::move(num_features), bias.sym_numel());
}
BatchNormBackend backend = _select_batch_norm_backend(input, weight, bias, running_mean, running_var, training, eps);
@ -923,7 +923,7 @@ std::tuple<Tensor, Tensor, Tensor> _batch_norm_legit_no_stats_cpu(
std::tuple<Tensor, Tensor, Tensor> _batch_norm_legit_no_training(
const Tensor& self, const std::optional<Tensor>& weight_opt, const std::optional<Tensor>& bias_opt,
const Tensor& running_mean, const Tensor& running_var, double momentum, double eps) {
return at::_native_batch_norm_legit(self, weight_opt, bias_opt, const_cast<Tensor&>(running_mean), const_cast<Tensor&>(running_var), /*training=*/false, momentum, eps);
return at::_native_batch_norm_legit(self, weight_opt, bias_opt, const_cast<Tensor&>(running_mean), const_cast<Tensor&>(running_var), /*train=*/false, momentum, eps);
}

View File

@ -1533,7 +1533,7 @@ std::tuple<Tensor, Tensor> lstm_cell(
check_rnn_cell_forward_input(input, w_ih.sym_size(1));
auto hidden_size = w_hh.sym_size(1);
check_rnn_cell_forward_hidden(input, hx[0], hidden_size, 0);
check_rnn_cell_forward_hidden(input, hx[1], hidden_size, 1);
check_rnn_cell_forward_hidden(input, hx[1], std::move(hidden_size), 1);
static at::Tensor undefined;
return LSTMCell<CellParams>{}(input, std::make_tuple(hx[0], hx[1]), CellParams{w_ih, w_hh, b_ih, b_hh, undefined});
}
@ -1612,13 +1612,13 @@ std::tuple<Tensor, Tensor, Tensor, Tensor, Tensor> _thnn_differentiable_gru_cell
h_g = h_g + hidden_bias;
}
auto chunked_input_gates = in_g.unsafe_chunk(3, 1);
const Tensor& ir = chunked_input_gates[0];
const Tensor& ii = chunked_input_gates[1];
const Tensor& in = chunked_input_gates[2];
Tensor ir = chunked_input_gates[0];
Tensor ii = chunked_input_gates[1];
Tensor in = chunked_input_gates[2];
auto chunked_hidden_gates = h_g.unsafe_chunk(3, 1);
const Tensor& hr = chunked_hidden_gates[0];
const Tensor& hi = chunked_hidden_gates[1];
const Tensor& hn = chunked_hidden_gates[2];
Tensor hr = chunked_hidden_gates[0];
Tensor hi = chunked_hidden_gates[1];
Tensor hn = chunked_hidden_gates[2];
Tensor rg = (ir + hr).sigmoid();
Tensor ig = (ii + hi).sigmoid();
Tensor grad_hx = grad_hy * ig;

View File

@ -409,17 +409,17 @@ static inline Tensor& unary_op_impl_out(Tensor& result, const Tensor& self, Stub
}
template <typename Stub, typename ...Args>
static inline Tensor& unary_op_impl_float_out(Tensor& result, const Tensor& self, Stub& stub, Args&&... args) {
static inline Tensor& unary_op_impl_float_out(Tensor& result, const Tensor& self, Stub& stub, Args... args) {
auto iter = TensorIterator::unary_float_op(result, self);
stub(iter.device_type(), iter, std::forward<Args>(args)...);
stub(iter.device_type(), iter, args...);
return result;
}
template <typename Stub, typename ...Args>
static inline Tensor unary_op_impl_float(const Tensor& self, Stub& stub, Args&&... args) {
static inline Tensor unary_op_impl_float(const Tensor& self, Stub& stub, Args... args) {
Tensor result;
auto iter = TensorIterator::unary_float_op(result, self);
stub(iter.device_type(), iter, std::forward<Args>(args)...);
stub(iter.device_type(), iter, args...);
return iter.output();
}

View File

@ -323,7 +323,7 @@ std::tuple<Tensor, Tensor, Tensor> unique_consecutive_cpu_template(
template<class ForwardIt>
ForwardIt _unique_dim_cpu_impl(ForwardIt first, ForwardIt last,
std::vector<int64_t>& indices, const Tensor& inverse_indices_vec, const Tensor& counts) {
std::vector<int64_t>& indices, Tensor inverse_indices_vec, Tensor counts) {
if (first == last) {
return last;
}

View File

@ -24,7 +24,7 @@ constexpr int64_t num_output_channels_index [[maybe_unused]] = 10;
constexpr int64_t num_input_channels_index [[maybe_unused]] = 11;
template <typename TENSOR_DTYPE, typename VEC_DTYPE>
std::vector<VEC_DTYPE> unwrap_vector(const at::Tensor& tensor) {
std::vector<VEC_DTYPE> unwrap_vector(at::Tensor tensor) {
std::vector<VEC_DTYPE> vec(tensor.numel());
TENSOR_DTYPE* tensor_data_ptr = tensor.data_ptr<TENSOR_DTYPE>();
std::copy(tensor_data_ptr, tensor_data_ptr + tensor.numel(), vec.data());
@ -39,7 +39,7 @@ std::vector<VEC_DTYPE> unwrap_vector(const at::Tensor& tensor) {
*/
void unpack_bcsr(
int8_t* dst,
const ao::sparse::BCSR& bcsr,
ao::sparse::BCSR bcsr,
const int64_t R,
const int64_t C,
const int64_t RB,

View File

@ -999,12 +999,41 @@ void gpu_kernel_impl(TensorIteratorBase& iter, const func_t& f) {
dtypes[i] = iter.dtype(i);
}
auto offset_calc = ::make_offset_calculator<traits::arity + 1>(iter);
#ifdef USE_ROCM
constexpr int grp_sz = 128;
launch_legacy_kernel_manual_unroll<grp_sz, 4>(numel, [=] GPU_LAMBDA(int idx, bool unrl) {
if (unrl) {
auto offsets0 = offset_calc.get(idx);
auto offsets1 = offset_calc.get(idx + grp_sz);
auto offsets2 = offset_calc.get(idx + grp_sz * 2);
auto offsets3 = offset_calc.get(idx + grp_sz * 3);
void* out0 = data[0] + offsets0[0];
void* out1 = data[0] + offsets1[0];
void* out2 = data[0] + offsets2[0];
void* out3 = data[0] + offsets3[0];
arg0_t result0 = invoke(f, &data[1], &offsets0[1], &dtypes[1], 1);
arg0_t result1 = invoke(f, &data[1], &offsets1[1], &dtypes[1], 1);
arg0_t result2 = invoke(f, &data[1], &offsets2[1], &dtypes[1], 1);
arg0_t result3 = invoke(f, &data[1], &offsets3[1], &dtypes[1], 1);
c10::cast_and_store<arg0_t>(dtypes[0], out0, result0);
c10::cast_and_store<arg0_t>(dtypes[0], out1, result1);
c10::cast_and_store<arg0_t>(dtypes[0], out2, result2);
c10::cast_and_store<arg0_t>(dtypes[0], out3, result3);
} else {
auto offsets = offset_calc.get(idx);
void* out = data[0] + offsets[0];
arg0_t result = invoke(f, &data[1], &offsets[1], &dtypes[1], 1);
c10::cast_and_store<arg0_t>(dtypes[0], out, result);
}
});
#else
launch_legacy_kernel<128, 4>(numel, [=] GPU_LAMBDA(int idx) {
auto offsets = offset_calc.get(idx);
void* out = data[0] + offsets[0];
arg0_t result = invoke(f, &data[1], &offsets[1], &dtypes[1], 1);
c10::cast_and_store<arg0_t>(dtypes[0], out, result);
});
#endif
}
}

View File

@ -51,7 +51,7 @@ std::vector<Tensor> foreach_tensor_list_op(
Op<opmath_t>(),
alpha.to<opmath_t>());
return tensor_lists[2];
return std::move(tensor_lists[2]);
}
template <typename T, template <class> class Op>

View File

@ -45,7 +45,7 @@ std::vector<Tensor> foreach_binary_op(
/* res_arg_index */ 1>(),
Op<opmath_t>(),
scalar.to<opmath_t>());
return tensor_lists[1];
return std::move(tensor_lists[1]);
}
template <typename T, template <class> class Op>

View File

@ -33,7 +33,7 @@ std::vector<Tensor> foreach_binary_op(
}
tensor_lists.emplace_back(tensors.vec());
tensor_lists.emplace_back(vec_res);
tensor_lists.emplace_back(std::move(vec_res));
using opmath_t = at::opmath_type<T>;
multi_tensor_apply<2, opmath_t>(
@ -46,7 +46,7 @@ std::vector<Tensor> foreach_binary_op(
/* res_arg_index */ 1>(),
Op<opmath_t>());
return tensor_lists[1];
return std::move(tensor_lists[1]);
}
template <typename T, template <class> class Op>

View File

@ -56,7 +56,7 @@ std::vector<Tensor> foreach_binary_op(
Op<opmath_t>(),
scalar.data_ptr<T>(),
alpha.to<opmath_t>());
return tensor_lists[1];
return std::move(tensor_lists[1]);
}
template <typename T, template <class> class Op>

View File

@ -57,7 +57,7 @@ std::vector<Tensor> foreach_pointwise_op(
scalar.to<opmath_t>());
});
return tensor_lists[3];
return std::move(tensor_lists[3]);
}
template <template <class> class Op>
@ -160,7 +160,7 @@ std::vector<Tensor> foreach_pointwise_op(
Op<opmath_t>());
});
return tensor_lists[3];
return std::move(tensor_lists[3]);
}
#define FOREACH_POINTWISE_OP_SCALAR(NAME, OP) \

View File

@ -37,7 +37,7 @@ std::vector<at::Tensor> foreach_tensor_lerp_ternary_cuda(
vec_res.emplace_back(at::native::empty_like(t));
}
std::vector<std::vector<at::Tensor>> tensor_lists{
tensors1.vec(), tensors2.vec(), tensors3.vec(), vec_res};
tensors1.vec(), tensors2.vec(), tensors3.vec(), std::move(vec_res)};
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(
at::ScalarType::Half,
@ -56,7 +56,7 @@ std::vector<at::Tensor> foreach_tensor_lerp_ternary_cuda(
LerpFunctor<opmath_t>());
});
return tensor_lists[3];
return std::move(tensor_lists[3]);
}
void foreach_tensor_lerp_ternary_cuda_(
@ -104,7 +104,7 @@ std::vector<at::Tensor> foreach_tensor_lerp_list_cuda(
vec_res.emplace_back(at::native::empty_like(t));
}
std::vector<std::vector<at::Tensor>> tensor_lists{
tensors1.vec(), tensors2.vec(), vec_res};
tensors1.vec(), tensors2.vec(), std::move(vec_res)};
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(
at::ScalarType::Half,
@ -124,7 +124,7 @@ std::vector<at::Tensor> foreach_tensor_lerp_list_cuda(
weight.to<opmath_t>());
});
return tensor_lists[2];
return std::move(tensor_lists[2]);
}
void foreach_tensor_lerp_list_cuda_(
@ -173,7 +173,7 @@ std::vector<at::Tensor> foreach_tensor_lerp_scalarlist_cuda(
vec_res.emplace_back(at::native::empty_like(t));
}
std::vector<std::vector<at::Tensor>> tensor_lists{
tensors1.vec(), tensors2.vec(), vec_res};
tensors1.vec(), tensors2.vec(), std::move(vec_res)};
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(
at::ScalarType::Half,
@ -193,7 +193,7 @@ std::vector<at::Tensor> foreach_tensor_lerp_scalarlist_cuda(
LerpFunctor<opmath_t>());
});
return tensor_lists[2];
return std::move(tensor_lists[2]);
}
void foreach_tensor_lerp_scalarlist_cuda_(

View File

@ -67,7 +67,7 @@ std::vector<Tensor> foreach_unary_op(TensorList tensors) {
/* res_arg_index */ 1>(),
Op<opmath_t>());
return tensor_lists[1];
return std::move(tensor_lists[1]);
}
template <typename scalar_t, template <class> class Op>

View File

@ -125,8 +125,6 @@ Tensor& max_unpooling2d_forward_out_cuda(const Tensor& self_,
TORCH_CHECK(
indices_.scalar_type() == at::ScalarType::Long,
"elements in indices should be type int64 but got: ", indices_.scalar_type());
auto oheight = output_size[0];
auto owidth = output_size[1];
TensorArg output_arg{output, "output", 1}, self_arg{self_, "self_", 2},
indices_arg{indices_, "indices_", 3};
@ -149,6 +147,9 @@ Tensor& max_unpooling2d_forward_out_cuda(const Tensor& self_,
output_size.size() == 2,
"There should be exactly two elements (height, width) in output_size, but got ", output_size.size(), " elements.");
auto oheight = output_size[0];
auto owidth = output_size[1];
int64_t dimw = 2;
int64_t dimh = 1;
int64_t numBatch = 1;
@ -217,9 +218,6 @@ static void max_unpooling3d_shape_check(
IntArrayRef stride,
IntArrayRef padding,
const char *fn_name) {
int64_t oT = output_size[0];
int64_t oH = output_size[1];
int64_t oW = output_size[2];
TORCH_CHECK(
indices.scalar_type() == at::ScalarType::Long,
"elements in indices should be type int64 but got: ", indices.scalar_type());
@ -250,6 +248,10 @@ static void max_unpooling3d_shape_check(
"strides should be greater than zero, but got stride: ",
stride);
int64_t oT = output_size[0];
int64_t oH = output_size[1];
int64_t oW = output_size[2];
int dimw = 3;
int dimh = 2;
int dimt = 1;
@ -402,8 +404,6 @@ at::Tensor& max_unpooling2d_backward_out_cuda(const Tensor& grad_output_,
const Tensor& indices_,
IntArrayRef output_size,
Tensor& grad_input) {
int64_t oheight = output_size[0];
int64_t owidth = output_size[1];
TORCH_CHECK(grad_input.is_contiguous(), "grad_input must be contiguous");
TORCH_CHECK(
indices_.scalar_type() == at::ScalarType::Long,
@ -426,6 +426,9 @@ at::Tensor& max_unpooling2d_backward_out_cuda(const Tensor& grad_output_,
TORCH_CHECK(output_size.size() == 2, "output_size must have two elements, got size: ", output_size.size());
int64_t oheight = output_size[0];
int64_t owidth = output_size[1];
int64_t nInputCols, nInputRows, nInputPlane;
int dimw = 2;
@ -505,13 +508,14 @@ at::Tensor& max_unpooling3d_backward_out_cuda(const Tensor& grad_output_,
IntArrayRef padding,
Tensor& grad_input) {
TORCH_CHECK(grad_input.is_contiguous(), "grad_input must be contiguous");
int64_t oT = output_size[0];
int64_t oH = output_size[1];
int64_t oW = output_size[2];
max_unpooling3d_shape_check(
self_, grad_output_, indices_, output_size, stride, padding, "max_unpooling3d_backward_out_cuda()");
int64_t oT = output_size[0];
int64_t oH = output_size[1];
int64_t oW = output_size[2];
int batchSize = 0;
int inputSlices = 0;
int inputTime = 0;

View File

@ -300,8 +300,6 @@ void nonzero_static_cuda_out_impl(
int64_t size,
int64_t fill_value,
Tensor& out) {
#if defined(CUDA_VERSION) || defined(USE_ROCM)
Tensor self_contiguous_ = self.contiguous();
// see comment in nonzero_cuda_out_impl on reqs for out
bool out_correct_size =
@ -377,9 +375,6 @@ void nonzero_static_cuda_out_impl(
if (need_to_copy) {
out.copy_(out_temp);
}
#else
TORCH_CHECK(false, "Nonzero_static is not supported for cuda <= 11.4");
#endif
}
Tensor& nonzero_out_cuda(const Tensor& self, Tensor& out) {

View File

@ -221,22 +221,9 @@ static const Tensor& _exec_fft(Tensor& out, const Tensor& self, IntArrayRef out_
std::optional<CuFFTConfig> uncached_plan;
const CuFFTConfig * config = nullptr;
// Workaround for gh-63152, gh-58724
// Bluestein plans in CUDA 11.1 (cufft 10.3) cannot be re-used
// Bluestein's algorithm is only used when a size has large prime factors,
// sizes with only small prime factors can still be cached
bool use_caching = true;
#ifdef CUFFT_VERSION
if constexpr (10300 <= CUFFT_VERSION && CUFFT_VERSION < 10400) {
// Only cache plans for transforms with small prime factors
use_caching = std::none_of(
signal_size.begin() + 1, signal_size.end(), [](int64_t dim_size) {
return has_large_prime_factor(dim_size);
});
}
#endif
if (use_caching && plan_cache.max_size() > 0) {
if (plan_cache.max_size() > 0) {
guard.lock();
if (plan_cache.max_size() > 0) { // check again after acquiring the lock
config = &plan_cache.lookup(Params);

View File

@ -35,7 +35,7 @@ C10_ALWAYS_INLINE void _check_rms_norm_inputs_symint(
std::stringstream ss;
ss << "Given normalized_shape=" << normalized_shape
<< ", expected input with shape [*";
for (const auto& size : normalized_shape) {
for (auto size : normalized_shape) {
ss << ", " << size;
}
ss << "], but got input of size" << input_shape;

View File

@ -198,7 +198,7 @@ static Tensor _mps_convolution_impl(const Tensor& input_t_,
if (input_t.is_contiguous(memory_format) && output_t.is_contiguous(memory_format) && is_macOS_15_0_or_newer) {
inputNDArray = getMPSNDArray(input_t, inputShape);
outputNDArray = getMPSNDArray(*output, outputShape);
outputNDArray = getMPSNDArray(output_t, outputShape);
}
auto cachedGraph = LookUpOrCreateCachedGraph<CachedGraph>(key, [&](auto mpsGraph, auto newCachedGraph) {
@ -302,7 +302,7 @@ static Tensor _mps_convolution_impl(const Tensor& input_t_,
}
}
auto outputPlaceholder = outputNDArray ? Placeholder(cachedGraph->outputTensor_, outputNDArray)
: Placeholder(cachedGraph->outputTensor_, *output);
: Placeholder(cachedGraph->outputTensor_, output_t);
NSMutableDictionary<MPSGraphTensor*, MPSGraphTensorData*>* feeds =
[[[NSMutableDictionary alloc] initWithCapacity:3] autorelease];
@ -315,7 +315,7 @@ static Tensor _mps_convolution_impl(const Tensor& input_t_,
runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder);
}
return *output;
return output_t;
}
Tensor _mps_convolution(const Tensor& input_t,

View File

@ -20,6 +20,7 @@
#include <ATen/ops/baddbmm_native.h>
#include <ATen/ops/bmm_native.h>
#include <ATen/ops/cholesky_native.h>
#include <ATen/ops/eye_native.h>
#include <ATen/ops/linalg_cholesky_ex_native.h>
#include <ATen/ops/linalg_inv_ex_native.h>
#include <ATen/ops/linalg_lu_factor_ex_native.h>
@ -496,26 +497,24 @@ static void linalg_inv_ex_out_mps_impl(const Tensor& A, bool check_errors, const
using namespace mps;
TORCH_CHECK(result.is_mps(), "Output tensor is not MPS");
TORCH_CHECK(!A.is_complex(), "linalg_inv: not supported for complex types yet!");
using CachedGraph = MPSUnaryCachedGraph;
MPSStream* stream = getCurrentMPSStream();
info.zero_();
if (A.numel() == 0) {
return;
}
if (!result.is_contiguous()) {
result.unsafeGetTensorImpl()->empty_tensor_restride(MemoryFormat::Contiguous);
}
auto A_sizes = A.sizes();
int ndim = A.dim();
Tensor LU = empty_like(A);
Tensor identity = zeros_like(A);
Tensor LU = empty_like(A, MemoryFormat::Contiguous);
Tensor identity = eye(A.size(-2), A.size(-1), A.scalar_type(), A.options().layout(), A.device()).expand_as(A);
Tensor pivots = empty({A_sizes.begin(), A_sizes.end() - 1}, A.options().dtype(kInt));
(ndim == 2 ? identity.diagonal() : identity.diagonal(0, -2, -1)).fill_(1);
linalg_solve_out_mps_impl(A, identity, true, check_errors, result, LU, pivots, info);
// need to do this to keep the strides of the result tensor
// mps's solve expects row major layout, while inductor
// expects result to be column major
Tensor tmp = empty_like(A, MemoryFormat::Contiguous);
linalg_solve_out_mps_impl(A, identity, true, check_errors, tmp, LU, pivots, info);
result.copy_(tmp);
}
static Tensor& mm_out_mps_impl(const Tensor& self, const Tensor& other, Tensor& output) {

View File

@ -519,6 +519,13 @@ static void max_unpool_out_mps_template(const Tensor& input,
Tensor& output,
const int32_t pooling_dims,
const std::string& op_name) {
TORCH_CHECK(output_size_.size() == static_cast<size_t>(pooling_dims),
op_name,
"There should be exactly ",
pooling_dims,
" elements but got ",
output_size_.size());
auto dims = input.dim();
auto leading_dims = input.dim() - pooling_dims;

View File

@ -77,7 +77,7 @@ static Tensor NestedTensor_elementwise_Tensor(
const Tensor& other,
const std::string& op_name,
bool supports_striding,
const Func& f) {
Func f) {
Tensor self_contiguous = self;
Tensor other_contiguous = other;
// self is a scalar
@ -238,7 +238,7 @@ static Tensor& NestedTensor_elementwise__Tensor(
Tensor& self,
const Tensor& other,
const std::string& op_name,
const Func& f) {
Func f) {
// self is a scalar
if (!self.is_nested() && self.dim() == 0 && self.numel() == 1) {
auto other_impl = get_nested_tensor_impl(other);

View File

@ -149,7 +149,7 @@ Tensor MakeStridedQTensorCPU(
const IntArrayRef& sizes,
const IntArrayRef& strides,
const TensorOptions& options,
const QuantizerPtr& quantizer) {
QuantizerPtr quantizer) {
AT_ASSERT(options.device().is_cpu());
at::native::check_size_nonnegative(sizes);
auto* allocator = at::getCPUAllocator();

View File

@ -37,7 +37,7 @@ struct TORCH_API PackedLinearWeight : public LinearPackedParamsBase {
col_offsets(std::move(col_offsets)),
w_scale(std::move(w_scale)),
w_zp(std::move(w_zp)),
q_scheme(q_scheme) {}
q_scheme(std::move(q_scheme)) {}
std::unique_ptr<fbgemm::PackBMatrix<int8_t>> w;
std::optional<at::Tensor> bias_;
std::vector<int32_t> col_offsets;
@ -316,7 +316,7 @@ Tensor MakeStridedQTensorCPU(
const IntArrayRef& sizes,
const IntArrayRef& strides,
const TensorOptions& options,
const QuantizerPtr& quantizer);
QuantizerPtr quantizer);
Tensor MakeEmptyAffineQuantizedChannelsLast3dTensor(
int64_t N,

View File

@ -7,7 +7,7 @@ QTensorImpl::QTensorImpl(
DispatchKeySet key_set,
const caffe2::TypeMeta data_type,
QuantizerPtr quantizer)
: TensorImpl(std::move(storage), key_set, data_type),
: TensorImpl(std::move(storage), std::move(key_set), data_type),
quantizer_(std::move(quantizer)) {}
QTensorImpl::QTensorImpl(
@ -16,7 +16,7 @@ QTensorImpl::QTensorImpl(
DispatchKeySet key_set,
const caffe2::TypeMeta data_type,
QuantizerPtr quantizer)
: TensorImpl(type, std::move(storage), key_set, data_type),
: TensorImpl(type, std::move(storage), std::move(key_set), data_type),
quantizer_(std::move(quantizer)) {}
const char* QTensorImpl::tensorimpl_type_name() const {

View File

@ -4,8 +4,6 @@
#include <c10/core/TensorImpl.h>
#include <c10/util/Exception.h>
#include <utility>
namespace at {
/**
@ -38,7 +36,7 @@ struct TORCH_API QTensorImpl : public c10::TensorImpl {
}
void set_quantizer_(QuantizerPtr quantizer) {
quantizer_ = std::move(quantizer);
quantizer_ = quantizer;
}
/**

View File

@ -107,7 +107,7 @@ static int64_t get_sub_byte_tensor_size(IntArrayRef sizes, size_t dtype_itemsize
inline Tensor new_qtensor(
IntArrayRef sizes,
const TensorOptions& options,
const QuantizerPtr& quantizer) {
QuantizerPtr quantizer) {
auto memory_format = options.memory_format_opt().value_or(MemoryFormat::Contiguous);
auto device = options.device();
at::Allocator* allocator = nullptr;
@ -338,7 +338,7 @@ Tensor from_blob_quantized_per_tensor_affine(
const std::size_t datasize = size * itemsize;
DataPtr data_ptr = InefficientStdFunctionContext::makeDataPtr(
data, std::move(deleter), options.device());
data, deleter, options.device());
Storage storage{Storage::use_byte_size_t{}, datasize, std::move(data_ptr)};
@ -411,7 +411,7 @@ Tensor from_blob_quantized_per_channel_affine(
const std::size_t datasize = size * itemsize;
DataPtr data_ptr = InefficientStdFunctionContext::makeDataPtr(
data, std::move(deleter), options.device());
data, deleter, options.device());
Storage storage{Storage::use_byte_size_t{}, datasize, std::move(data_ptr)};

View File

@ -196,8 +196,8 @@ struct TORCH_API PerChannelAffineFloatQParamsQuantizer : public PerChannelAffine
Tensor zero_points,
int64_t axis)
: PerChannelAffineQuantizer(scalar_type,
std::move(scales),
std::move(zero_points),
scales,
zero_points,
axis) {}
QScheme qscheme() const override {
@ -246,7 +246,7 @@ TORCH_API QuantizerPtr make_unknown_quantizer(ScalarType scalar_type);
TORCH_API Tensor new_qtensor(
IntArrayRef sizes,
const TensorOptions& options,
const QuantizerPtr& quantizer);
QuantizerPtr quantizer);
TORCH_API void set_quantizer_(const Tensor& self, ConstQuantizerPtr quantizer);

View File

@ -3269,7 +3269,7 @@ class C10_TensorImpl_Size_Check_Dummy_Class : private TensorImpl {
is_le<sizeof(autograd_meta_), 16, FieldNameEnum::autograd_meta_>();
is_le<sizeof(extra_meta_), 16, FieldNameEnum::extra_meta_>();
are_equal<sizeof(version_counter_), 8, FieldNameEnum::version_counter_>();
are_equal<sizeof(pyobj_slot_), 8, FieldNameEnum::pyobj_slot_>();
are_equal<sizeof(pyobj_slot_), 16, FieldNameEnum::pyobj_slot_>();
are_equal<sizeof(sizes_and_strides_), 88, FieldNameEnum::sizes_and_strides_>();
are_equal<sizeof(storage_offset_), 8, FieldNameEnum::storage_offset_>();
are_equal<sizeof(numel_), 8, FieldNameEnum::numel_>();

View File

@ -13,10 +13,11 @@ struct C10_API PyInterpreterHooksInterface {
// Get the PyInterpreter instance
// Stub implementation throws error when Python is not available
// We return nullptr rather than throwing an error since there are bits of c10
// that expect an empty PyObjectSlot when python is not available.
virtual PyInterpreter* getPyInterpreter() const {
return nullptr;
TORCH_CHECK(
false,
"PyTorch was compiled without Python support. "
"Cannot access Python interpreter from C++.");
}
};

View File

@ -2,7 +2,7 @@
namespace c10::impl {
PyObjectSlot::PyObjectSlot() : pyobj_(nullptr) {}
PyObjectSlot::PyObjectSlot() : pyobj_interpreter_(nullptr), pyobj_(nullptr) {}
PyObjectSlot::~PyObjectSlot() {
maybe_destroy_pyobj();
@ -10,9 +10,9 @@ PyObjectSlot::~PyObjectSlot() {
void PyObjectSlot::maybe_destroy_pyobj() {
if (owns_pyobj()) {
TORCH_INTERNAL_ASSERT(getGlobalPyInterpreter() != nullptr);
TORCH_INTERNAL_ASSERT(pyobj_interpreter_ != nullptr);
TORCH_INTERNAL_ASSERT(pyobj_ != nullptr);
(*getGlobalPyInterpreter())
(*pyobj_interpreter_.load(std::memory_order_acquire))
->decref(_unchecked_untagged_pyobj(), /*has_pyobj_slot*/ true);
// NB: this destructor can only be entered when there are no
// references to this C++ object (obviously), NOR any references
@ -25,7 +25,7 @@ void PyObjectSlot::maybe_destroy_pyobj() {
}
PyInterpreter* PyObjectSlot::pyobj_interpreter() {
return getGlobalPyInterpreter();
return pyobj_interpreter_.load(std::memory_order_acquire);
}
PyObject* PyObjectSlot::_unchecked_untagged_pyobj() const {
@ -35,7 +35,7 @@ PyObject* PyObjectSlot::_unchecked_untagged_pyobj() const {
}
PyInterpreter& PyObjectSlot::load_pyobj_interpreter() const {
auto interpreter = getGlobalPyInterpreter();
auto interpreter = pyobj_interpreter_.load(std::memory_order_acquire);
if (interpreter) {
return *interpreter;
}

View File

@ -6,17 +6,10 @@
#include <c10/util/python_stub.h>
#include <optional>
#include <atomic>
namespace c10::impl {
// Function pointer type for getting the global interpreter
using GetPyInterpreterFn = PyInterpreter* (*)();
// Global function pointer (set by csrc initialization)
C10_API extern GetPyInterpreterFn g_get_pyinterpreter_fn;
// Helper function to get the global interpreter
C10_API PyInterpreter* getGlobalPyInterpreter();
struct C10_API PyObjectSlot {
public:
PyObjectSlot();
@ -33,6 +26,8 @@ struct C10_API PyObjectSlot {
// NB: THIS FUNCTION CAN RAISE AN EXCEPTION. Make sure to clean up after
// PyObject if necessary!
void init_pyobj(PyObject* pyobj) {
pyobj_interpreter_.store(
getGlobalPyInterpreter(), std::memory_order_relaxed);
pyobj_ = pyobj;
}
@ -60,15 +55,18 @@ struct C10_API PyObjectSlot {
// @todo alban: I'm not too sure what's going on here, we can probably delete
// it but it's worthwhile making sure
std::optional<PyObject*> check_pyobj() const {
impl::PyInterpreter* interpreter = getGlobalPyInterpreter();
if (interpreter == nullptr || pyobj_ == nullptr) {
std::optional<PyObject*> check_pyobj(bool ignore_hermetic_tls = false) const {
impl::PyInterpreter* interpreter =
pyobj_interpreter_.load(std::memory_order_acquire);
if (interpreter == nullptr) {
return std::nullopt;
}
if (c10::impl::HermeticPyObjectTLS::get_state()) {
if (!ignore_hermetic_tls && c10::impl::HermeticPyObjectTLS::get_state()) {
return std::nullopt;
} else {
return _unchecked_untagged_pyobj();
}
return _unchecked_untagged_pyobj();
}
PyInterpreter& load_pyobj_interpreter() const;
@ -78,6 +76,30 @@ struct C10_API PyObjectSlot {
void set_owns_pyobj(bool b);
private:
// This field contains the interpreter tag for this object. See
// Note [Python interpreter tag] for general context
//
// Note [Memory ordering on Python interpreter tag]
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// What memory_order do we need when accessing this atomic? We don't
// need a single total modification order (as provided by
// memory_order_seq_cst) as pyobj_interpreter_ is monotonic: it can only
// transition from -1 to some positive integer and never changes afterwards.
// Because there is only one modification, it trivially already has a total
// modification order (e.g., we don't need fences or locked instructions on
// x86)
//
// In fact, one could make a reasonable argument that relaxed reads are OK,
// due to the presence of external locking (GIL) to ensure that interactions
// with other data structures are still correctly synchronized, so that
// we fall in the "Single-Location Data Structures" case as described in
// http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p2055r0.pdf
// However, on x86, it doesn't matter if I use acquire or relaxed on the load
// as I get the same assembly in both cases. So I just use the more
// conservative acquire (which will impede compiler optimizations but I don't
// care)
std::atomic<PyInterpreter*> pyobj_interpreter_;
// This field contains a reference to a PyObject representing this Tensor.
// If pyobj is nullptr, when we transfer Tensor to Python, we allocate a new
// PyObject for it and set this field. This field does not have to be

View File

@ -14,7 +14,6 @@ namespace c10::cuda::CUDACachingAllocator::CudaMallocAsync {
using namespace c10::CachingAllocator;
using namespace c10::CachingDeviceAllocator;
#if CUDA_VERSION >= 11040 || defined(USE_ROCM)
// CUDA device allocator that uses cudaMallocAsync to implement
// the same interface as CUDACachingAllocator.cpp.
@ -926,13 +925,4 @@ CUDAAllocator* allocator() {
return &device_allocator;
}
#else
// NOLINTNEXTLINE(misc-use-internal-linkage)
CUDAAllocator* allocator() {
TORCH_CHECK(false, "Cannot use CudaMallocAsyncAllocator with cuda < 11.4.");
return nullptr;
}
#endif
} // namespace c10::cuda::CUDACachingAllocator::CudaMallocAsync

View File

@ -35,26 +35,26 @@ struct ExclusivelyOwnedTensorTraits {
// incremented.
const bool isUndefined = toDestroy == UndefinedTensorImpl::singleton();
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
toDestroy->refcount_ == 1 || (toDestroy->refcount_ == 0 && isUndefined),
toDestroy->refcount() == 1 ||
(toDestroy->refcount() == 0 && isUndefined),
"ExclusivelyOwned<Tensor> destroyed with isUndefined ",
isUndefined,
" and refcount ",
toDestroy->refcount_,
toDestroy->refcount(),
", expected 1 or, if isUndefined, 0!");
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
toDestroy->weakcount_ == 1 ||
(toDestroy->weakcount_ == 0 &&
toDestroy->weakcount() == 1 ||
(toDestroy->weakcount() == 0 &&
toDestroy == UndefinedTensorImpl::singleton()),
"ExclusivelyOwned<Tensor> destroyed with isUndefined ",
isUndefined,
" and weakcount ",
toDestroy->weakcount_,
toDestroy->weakcount(),
", expected 1 or, if isUndefined, 0!");
if (!isUndefined) {
#ifndef NDEBUG
// Needed to pass the debug assertions in ~intrusive_ptr_target.
toDestroy->refcount_ = 0;
toDestroy->weakcount_ = 0;
toDestroy->combined_refcount_.store(0, std::memory_order_relaxed);
#endif
delete toDestroy;
}

View File

@ -27,7 +27,78 @@ struct DontIncreaseRefcount {};
} // namespace raw
namespace detail {
constexpr uint32_t kImpracticallyHugeReferenceCount = 0x0FFFFFFF;
constexpr uint64_t kImpracticallyHugeReferenceCount = 0x0FFFFFFF;
constexpr uint64_t kImpracticallyHugeWeakReferenceCount =
(kImpracticallyHugeReferenceCount << 32);
constexpr uint64_t kReferenceCountOne = 1;
constexpr uint64_t kWeakReferenceCountOne = (kReferenceCountOne << 32);
constexpr uint64_t kUniqueRef = (kReferenceCountOne | kWeakReferenceCountOne);
template <class TTarget>
struct intrusive_target_default_null_type final {
static constexpr TTarget* singleton() noexcept {
return nullptr;
}
};
template <class TTarget, class ToNullType, class FromNullType>
TTarget* assign_ptr_(TTarget* rhs) {
if (FromNullType::singleton() == rhs) {
return ToNullType::singleton();
} else {
return rhs;
}
}
inline uint32_t refcount(uint64_t combined_refcount) {
return static_cast<uint32_t>(combined_refcount);
}
inline uint32_t weakcount(uint64_t combined_refcount) {
return static_cast<uint32_t>(combined_refcount >> 32);
}
// The only requirement for refcount increment is that it happens-before
// decrement, so no additional memory ordering is needed.
inline uint64_t atomic_combined_refcount_increment(
std::atomic<uint64_t>& combined_refcount,
uint64_t inc) {
return combined_refcount.fetch_add(inc, std::memory_order_relaxed) + inc;
}
inline uint32_t atomic_refcount_increment(
std::atomic<uint64_t>& combined_refcount) {
return detail::refcount(atomic_combined_refcount_increment(
combined_refcount, kReferenceCountOne));
}
inline uint32_t atomic_weakcount_increment(
std::atomic<uint64_t>& combined_refcount) {
return detail::weakcount(atomic_combined_refcount_increment(
combined_refcount, kWeakReferenceCountOne));
}
// The requirement is that all modifications to the managed object happen-before
// invocation of the managed object destructor, and that allocation of the
// managed object storage happens-before deallocation of the storage.
//
// To get this ordering, all non-final decrements must synchronize-with the
// final decrement. So all non-final decrements have to store-release while the
// final decrement has to load-acquire, either directly or with the help of
// fences. But it's easiest just to have all decrements be acq-rel. And it turns
// out, on modern architectures and chips, it's also fastest.
inline uint64_t atomic_combined_refcount_decrement(
std::atomic<uint64_t>& combined_refcount,
uint64_t dec) {
return combined_refcount.fetch_sub(dec, std::memory_order_acq_rel) - dec;
}
inline uint32_t atomic_weakcount_decrement(
std::atomic<uint64_t>& combined_refcount) {
return detail::weakcount(atomic_combined_refcount_decrement(
combined_refcount, kWeakReferenceCountOne));
}
} // namespace detail
/**
@ -80,8 +151,14 @@ class C10_API intrusive_ptr_target {
// atomically increment the use count, if it is greater than 0.
// If it is not, you must report that the storage is dead.
//
mutable std::atomic<uint32_t> refcount_;
mutable std::atomic<uint32_t> weakcount_;
//.We use a single combined count for refcount and weakcount so that
// we can atomically operate on both at the same time for performance
// and defined behaviors.
//
mutable std::atomic<uint64_t> combined_refcount_;
static_assert(sizeof(std::atomic<uint64_t>) == 8);
static_assert(alignof(std::atomic<uint64_t>) == 8);
static_assert(std::atomic<uint64_t>::is_always_lock_free);
template <typename T, typename NullType>
friend class intrusive_ptr;
@ -126,16 +203,16 @@ class C10_API intrusive_ptr_target {
// caller of unsafe_adapt_non_heap_allocated wanted to
// use). We choose our reference count such that the count
// will not dip below kImpracticallyHugeReferenceCount regardless.
refcount_.load() == 0 ||
refcount_.load() >= detail::kImpracticallyHugeReferenceCount,
refcount() == 0 ||
refcount() >= detail::kImpracticallyHugeReferenceCount,
"Tried to destruct an intrusive_ptr_target that still has intrusive_ptr to it; refcount was ",
refcount_.load());
refcount());
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
// See ~intrusive_ptr for optimization that will frequently result in 1
// at destruction time.
weakcount_.load() == 1 || weakcount_.load() == 0 ||
weakcount_.load() == detail::kImpracticallyHugeReferenceCount - 1 ||
weakcount_.load() == detail::kImpracticallyHugeReferenceCount,
weakcount() == 1 || weakcount() == 0 ||
weakcount() == detail::kImpracticallyHugeReferenceCount - 1 ||
weakcount() == detail::kImpracticallyHugeReferenceCount,
"Tried to destruct an intrusive_ptr_target that still has weak_intrusive_ptr to it");
#if defined(_MSC_VER) && !defined(__clang__)
#pragma warning(pop)
@ -144,7 +221,7 @@ class C10_API intrusive_ptr_target {
#endif
}
constexpr intrusive_ptr_target() noexcept : refcount_(0), weakcount_(0) {}
constexpr intrusive_ptr_target() noexcept : combined_refcount_(0) {}
// intrusive_ptr_target supports copy and move: but refcount and weakcount
// don't participate (since they are intrinsic properties of the memory
@ -177,54 +254,17 @@ class C10_API intrusive_ptr_target {
* destructed), this function WILL NOT be called.
*/
virtual void release_resources() {}
};
namespace detail {
template <class TTarget>
struct intrusive_target_default_null_type final {
static constexpr TTarget* singleton() noexcept {
return nullptr;
uint32_t refcount(std::memory_order order = std::memory_order_relaxed) const {
return detail::refcount(combined_refcount_.load(order));
}
uint32_t weakcount(
std::memory_order order = std::memory_order_relaxed) const {
return detail::weakcount(combined_refcount_.load(order));
}
};
template <class TTarget, class ToNullType, class FromNullType>
TTarget* assign_ptr_(TTarget* rhs) {
if (FromNullType::singleton() == rhs) {
return ToNullType::singleton();
} else {
return rhs;
}
}
// The only requirement for refcount increment is that it happens-before
// decrement, so no additional memory ordering is needed.
inline uint32_t atomic_refcount_increment(std::atomic<uint32_t>& refcount) {
return refcount.fetch_add(1, std::memory_order_relaxed) + 1;
}
inline uint32_t atomic_weakcount_increment(std::atomic<uint32_t>& weakcount) {
return weakcount.fetch_add(1, std::memory_order_relaxed) + 1;
}
// The requirement is that all modifications to the managed object happen-before
// invocation of the managed object destructor, and that allocation of the
// managed object storage happens-before deallocation of the storage.
//
// To get this ordering, all non-final decrements must synchronize-with the
// final decrement. So all non-final decrements have to store-release while the
// final decrement has to load-acquire, either directly or with the help of
// fences. But it's easiest just to have all decrements be acq-rel. And it turns
// out, on modern architectures and chips, it's also fastest.
inline uint32_t atomic_refcount_decrement(std::atomic<uint32_t>& refcount) {
return refcount.fetch_sub(1, std::memory_order_acq_rel) - 1;
}
inline uint32_t atomic_weakcount_decrement(std::atomic<uint32_t>& weakcount) {
return weakcount.fetch_sub(1, std::memory_order_acq_rel) - 1;
}
} // namespace detail
template <class TTarget, class NullType>
class weak_intrusive_ptr;
@ -275,7 +315,7 @@ class intrusive_ptr final {
void retain_() {
if (target_ != NullType::singleton()) {
uint32_t new_refcount =
detail::atomic_refcount_increment(target_->refcount_);
detail::atomic_refcount_increment(target_->combined_refcount_);
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
new_refcount != 1,
"intrusive_ptr: Cannot increase refcount after it reached zero.");
@ -284,41 +324,25 @@ class intrusive_ptr final {
void reset_() noexcept {
if (target_ != NullType::singleton()) {
#if defined(__linux__) && (defined(__aarch64__) || defined(__x86_64__))
if constexpr (
std::atomic<uint64_t>::is_always_lock_free &&
std::atomic<uint32_t>::is_always_lock_free &&
sizeof(std::atomic<uint64_t>) == 8 &&
sizeof(std::atomic<uint32_t>) == 4) {
auto both_counts_ =
reinterpret_cast<std::atomic<uint64_t>*>(&target_->refcount_);
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
(reinterpret_cast<std::uintptr_t>(both_counts_) %
sizeof(std::atomic<uint64_t>)) == 0 &&
(reinterpret_cast<std::uintptr_t>(&target_->weakcount_) -
reinterpret_cast<std::uintptr_t>(both_counts_)) ==
sizeof(std::atomic<uint32_t>));
// 0x100000001ULL is a 64-bit number combination of both the refcount_
// and weakcount_ being 1.
constexpr uint64_t unique_ref_ = 0x100000001ULL;
if (both_counts_->load(std::memory_order_acquire) == unique_ref_) {
// Both counts are 1, so there are no weak references and
// we are releasing the last strong reference. No other
// threads can observe the effects of this target_ deletion
// call (e.g. calling use_count()) without a data race.
target_->refcount_.store(0, std::memory_order_relaxed);
delete target_;
return;
}
if (target_->combined_refcount_.load(std::memory_order_acquire) ==
detail::kUniqueRef) {
// Both counts are 1, so there are no weak references and
// we are releasing the last strong reference. No other
// threads can observe the effects of this target_ deletion
// call (e.g. calling use_count()) without a data race.
target_->combined_refcount_.store(0, std::memory_order_relaxed);
delete target_;
return;
}
#endif
if (detail::atomic_refcount_decrement(target_->refcount_) == 0) {
auto combined_refcount = detail::atomic_combined_refcount_decrement(
target_->combined_refcount_, detail::kReferenceCountOne);
if (detail::refcount(combined_refcount) == 0) {
bool should_delete =
(combined_refcount == detail::kWeakReferenceCountOne);
// See comment above about weakcount. As long as refcount>0,
// weakcount is one larger than the actual number of weak references.
// So we need to decrement it here.
bool should_delete =
target_->weakcount_.load(std::memory_order_acquire) == 1;
if (!should_delete) {
// justification for const_cast: release_resources is basically a
// destructor and a destructor always mutates the object, even for
@ -326,8 +350,8 @@ class intrusive_ptr final {
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
const_cast<std::remove_const_t<TTarget>*>(target_)
->release_resources();
should_delete =
detail::atomic_weakcount_decrement(target_->weakcount_) == 0;
should_delete = detail::atomic_weakcount_decrement(
target_->combined_refcount_) == 0;
}
if (should_delete) {
delete target_;
@ -354,12 +378,12 @@ class intrusive_ptr final {
// `mov`, whereas an atomic increment does a lock-prefixed `add`, which is
// much more expensive: https://godbolt.org/z/eKPzj8.)
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
target_->refcount_ == 0 && target_->weakcount_ == 0,
target_->combined_refcount_.load(std::memory_order_relaxed) == 0,
"intrusive_ptr: Newly-created target had non-zero refcounts. Does its "
"constructor do something strange like incref or create an "
"intrusive_ptr from `this`?");
target_->refcount_.store(1, std::memory_order_relaxed);
target_->weakcount_.store(1, std::memory_order_relaxed);
target_->combined_refcount_.store(
detail::kUniqueRef, std::memory_order_relaxed);
}
}
@ -482,14 +506,14 @@ class intrusive_ptr final {
if (target_ == NullType::singleton()) {
return 0;
}
return target_->refcount_.load(std::memory_order_relaxed);
return target_->refcount(std::memory_order_relaxed);
}
uint32_t weak_use_count() const noexcept {
if (target_ == NullType::singleton()) {
return 0;
}
return target_->weakcount_.load(std::memory_order_relaxed);
return target_->weakcount(std::memory_order_relaxed);
}
bool unique() const noexcept {
@ -518,8 +542,8 @@ class intrusive_ptr final {
*/
static intrusive_ptr reclaim(TTarget* owning_ptr) {
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
owning_ptr == NullType::singleton() ||
owning_ptr->refcount_.load() == 0 || owning_ptr->weakcount_.load(),
owning_ptr == NullType::singleton() || owning_ptr->refcount() == 0 ||
owning_ptr->weakcount(),
"TTarget violates the invariant that refcount > 0 => weakcount > 0");
return intrusive_ptr(owning_ptr, raw::DontIncreaseRefcount{});
}
@ -590,11 +614,11 @@ class intrusive_ptr final {
#ifdef NDEBUG
expected_decrefs = 0;
#endif
result.target_->refcount_.store(
detail::kImpracticallyHugeReferenceCount + expected_decrefs,
result.target_->combined_refcount_.store(
detail::refcount(
detail::kImpracticallyHugeReferenceCount + expected_decrefs) |
detail::kImpracticallyHugeWeakReferenceCount,
std::memory_order_relaxed);
result.target_->weakcount_.store(
detail::kImpracticallyHugeReferenceCount, std::memory_order_relaxed);
return result;
}
@ -611,7 +635,7 @@ class intrusive_ptr final {
static intrusive_ptr unsafe_reclaim_from_nonowning(TTarget* raw_ptr) {
// See Note [Stack allocated intrusive_ptr_target safety]
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
raw_ptr == NullType::singleton() || raw_ptr->refcount_.load() > 0,
raw_ptr == NullType::singleton() || raw_ptr->refcount() > 0,
"intrusive_ptr: Can only reclaim pointers that are owned by someone");
auto ptr = reclaim(raw_ptr); // doesn't increase refcount
ptr.retain_();
@ -745,7 +769,7 @@ class weak_intrusive_ptr final {
void retain_() {
if (target_ != NullType::singleton()) {
uint32_t new_weakcount =
detail::atomic_weakcount_increment(target_->weakcount_);
detail::atomic_weakcount_increment(target_->combined_refcount_);
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
new_weakcount != 1,
"weak_intrusive_ptr: Cannot increase weakcount after it reached zero.");
@ -754,7 +778,7 @@ class weak_intrusive_ptr final {
void reset_() noexcept {
if (target_ != NullType::singleton() &&
detail::atomic_weakcount_decrement(target_->weakcount_) == 0) {
detail::atomic_weakcount_decrement(target_->combined_refcount_) == 0) {
// NOLINTNEXTLINE(clang-analyzer-cplusplus.NewDelete)
delete target_;
}
@ -887,7 +911,7 @@ class weak_intrusive_ptr final {
if (target_ == NullType::singleton()) {
return 0;
}
return target_->refcount_.load(
return target_->refcount(
std::memory_order_relaxed); // refcount, not weakcount!
}
@ -895,7 +919,7 @@ class weak_intrusive_ptr final {
if (target_ == NullType::singleton()) {
return 0;
}
return target_->weakcount_.load(std::memory_order_relaxed);
return target_->weakcount(std::memory_order_relaxed);
}
bool expired() const noexcept {
@ -906,16 +930,17 @@ class weak_intrusive_ptr final {
if (target_ == NullType::singleton()) {
return intrusive_ptr<TTarget, NullType>();
} else {
auto refcount = target_->refcount_.load(std::memory_order_relaxed);
auto combined_refcount =
target_->combined_refcount_.load(std::memory_order_relaxed);
do {
if (refcount == 0) {
if (detail::refcount(combined_refcount) == 0) {
// Object already destructed, no strong references left anymore.
// Return nullptr.
return intrusive_ptr<TTarget, NullType>();
}
} while (!target_->refcount_.compare_exchange_weak(
refcount,
refcount + 1,
} while (!target_->combined_refcount_.compare_exchange_weak(
combined_refcount,
combined_refcount + detail::kReferenceCountOne,
std::memory_order_acquire,
std::memory_order_relaxed));
@ -952,9 +977,9 @@ class weak_intrusive_ptr final {
// if refcount == 0, weakcount only must be >0.
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
owning_weak_ptr == NullType::singleton() ||
owning_weak_ptr->weakcount_.load() > 1 ||
(owning_weak_ptr->refcount_.load() == 0 &&
owning_weak_ptr->weakcount_.load() > 0),
owning_weak_ptr->weakcount() > 1 ||
(owning_weak_ptr->refcount() == 0 &&
owning_weak_ptr->weakcount() > 0),
"weak_intrusive_ptr: Can only weak_intrusive_ptr::reclaim() owning pointers that were created using weak_intrusive_ptr::release().");
return weak_intrusive_ptr(owning_weak_ptr);
}
@ -1033,7 +1058,7 @@ namespace intrusive_ptr {
// NullType::singleton to this function
inline void incref(intrusive_ptr_target* self) {
if (self) {
detail::atomic_refcount_increment(self->refcount_);
detail::atomic_refcount_increment(self->combined_refcount_);
}
}
@ -1067,7 +1092,7 @@ inline uint32_t use_count(intrusive_ptr_target* self) {
namespace weak_intrusive_ptr {
inline void incref(weak_intrusive_ptr_target* self) {
detail::atomic_weakcount_increment(self->weakcount_);
detail::atomic_weakcount_increment(self->combined_refcount_);
}
inline void decref(weak_intrusive_ptr_target* self) {

View File

@ -396,8 +396,7 @@ size_t PyTorchStreamReader::getRecordMultiReaders(
size_t perThreadSize = (n + nthread - 1) / nthread;
std::vector<size_t> readSizes(nthread, 0);
std::lock_guard<std::mutex> guard(reader_lock_);
loaderThreads.reserve(nthread);
for (size_t i = 0; i < nthread; i++) {
for (size_t i = 0; i < nthread; i++) {
loaderThreads.emplace_back([this,
name,
i,
@ -416,7 +415,7 @@ for (size_t i = 0; i < nthread; i++) {
size =
read(recordOff + startPos, (char*)dst + startPos, threadReadSize);
} else {
const auto& reader = additionalReaders[i - 1];
auto reader = additionalReaders[i - 1];
size = reader->read(
recordOff + startPos, (char*)dst + startPos, threadReadSize);
}
@ -642,7 +641,7 @@ size_t PyTorchStreamReader::getRecordSize(const std::string& name) {
size_t PyTorchStreamReader::getRecordOffsetNoRead(
size_t cursor,
const std::string& filename,
std::string filename,
size_t size,
uint64_t alignment) {
std::string full_name = archive_name_plus_slash_ + filename;
@ -698,7 +697,7 @@ PyTorchStreamWriter::PyTorchStreamWriter(
}
PyTorchStreamWriter::PyTorchStreamWriter(
const std::function<size_t(const void*, size_t)>& writer_func,
const std::function<size_t(const void*, size_t)> writer_func,
bool compute_crc32,
uint64_t alignment)
: archive_name_("archive"),
@ -713,7 +712,7 @@ void PyTorchStreamWriter::setup(const string& file_name) {
memset(ar_.get(), 0, sizeof(mz_zip_archive));
archive_name_plus_slash_ = archive_name_ + "/"; // for writeRecord().
if (archive_name_.empty()) {
if (archive_name_.size() == 0) {
CAFFE_THROW("invalid file name: ", file_name);
}

View File

@ -180,7 +180,7 @@ class TORCH_API PyTorchStreamReader final {
size_t getRecordOffset(const std::string& name);
size_t getRecordOffsetNoRead(
size_t cursor,
const std::string& filename,
std::string filename,
size_t size,
uint64_t alignment);
bool hasRecord(const std::string& name);
@ -232,7 +232,7 @@ class TORCH_API PyTorchStreamWriter final {
bool compute_crc32 = true,
uint64_t alignment = 64);
explicit PyTorchStreamWriter(
const std::function<size_t(const void*, size_t)>& writer_func,
const std::function<size_t(const void*, size_t)> writer_func,
bool compute_crc32 = true,
uint64_t alignment = 64);

View File

@ -46,9 +46,10 @@ if(NOT __AOTRITON_INCLUDED)
set(__AOTRITON_BASE_URL "https://github.com/ROCm/aotriton/releases/download/") # @lint-ignore
set(__AOTRITON_Z "gz")
# Set the default __AOTRITON_LIB path
set(__AOTRITON_LIB "${__AOTRITON_INSTALL_DIR}/lib/libaotriton_v2.so")
if(WIN32)
set(__AOTRITON_LIB "${__AOTRITON_INSTALL_DIR}/lib/aotriton_v2.lib")
if(NOT WIN32)
set(__AOTRITON_LIB "lib/libaotriton_v2.so")
else()
set(__AOTRITON_LIB "lib/aotriton_v2.lib")
endif()
function(aotriton_build_windows_dependencies dlfcn-win32_external xz_external dlfcn-win32_DIR liblzma_DIR)
@ -143,8 +144,7 @@ if(NOT __AOTRITON_INCLUDED)
-DHIP_PLATFORM=amd
$<$<BOOL:${WIN32}>:-Ddlfcn-win32_DIR=${dlfcn-win32_DIR}>
$<$<BOOL:${WIN32}>:-Dliblzma_DIR=${liblzma_DIR}>
BUILD_BYPRODUCTS
"${__AOTRITON_LIB}"
BUILD_BYPRODUCTS "${__AOTRITON_INSTALL_DIR}/${__AOTRITON_LIB}"
USES_TERMINAL_DOWNLOAD TRUE
USES_TERMINAL_CONFIGURE TRUE
USES_TERMINAL_BUILD TRUE
@ -177,7 +177,7 @@ if(NOT __AOTRITON_INCLUDED)
INSTALL_COMMAND ${CMAKE_COMMAND} -E copy_directory
"${CMAKE_CURRENT_BINARY_DIR}/aotriton_runtime"
"${__AOTRITON_INSTALL_DIR}"
BUILD_BYPRODUCTS "${__AOTRITON_LIB}"
BUILD_BYPRODUCTS "${__AOTRITON_INSTALL_DIR}/${__AOTRITON_LIB}"
)
message(STATUS "Using AOTriton Runtime from pre-compiled binary ${__AOTRITON_URL}.\
Set env variables AOTRITON_INSTALL_FROM_SOURCE=1 to build from source.")
@ -267,7 +267,7 @@ if(NOT __AOTRITON_INCLUDED)
endforeach()
endforeach()
endif()
target_link_libraries(__caffe2_aotriton INTERFACE ${__AOTRITON_LIB})
target_link_libraries(__caffe2_aotriton INTERFACE "${__AOTRITON_INSTALL_DIR}/${__AOTRITON_LIB}")
target_include_directories(__caffe2_aotriton INTERFACE ${__AOTRITON_INSTALL_DIR}/include)
set(AOTRITON_FOUND TRUE)
endif() # __AOTRITON_INCLUDED

11 binary image files added (not shown).

View File

@ -0,0 +1,239 @@
(dynamic_shapes_advanced_control_options)=
# Advanced Options to Control Dynamic Behavior
PyTorch provides several advanced options to control dynamic behavior.
These options require a deep understanding of PyTorch internals and
may involve setting up additional tooling. These options include:
* Profile-Guided Optimization (PGO) is a technique that allows the compiler
to save automatic dynamic decisions and reuse them across jobs.
* Compiler Collective is a feature that is used to modify automatic dynamic
shapes behavior by inferring if an input is dynamic based on whether
its size varies across ranks.
## Profile-Guided Optimization (PGO)
Profile-Guided Optimization (PGO) enhances automatic dynamic by sharing profiling decisions across runs of your model. Specifically, it serializes all the choices made by automatic dynamic into a file on disk. You can then copy this file—or store it in a centralized metadata service like S3—and reuse it on other machines to ensure consistent behavior across environments.
For the purposes of the rest of this tutorial, you can turn on PGO locally with the following environment variables: `TORCH_COMPILE_JOB_ID=1 TORCH_DYNAMO_AUTOMATIC_DYNAMIC_LOCAL_PGO=1`.
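As a minimal local sketch (the function and shapes are hypothetical; the environment variables are set before importing torch so dynamo picks them up):
```python
# A minimal sketch: enable local PGO via the environment variables above, then
# run this script twice. The cold run records automatic dynamic decisions and
# the warm run reuses the persisted profile.
import os
os.environ["TORCH_COMPILE_JOB_ID"] = "1"
os.environ["TORCH_DYNAMO_AUTOMATIC_DYNAMIC_LOCAL_PGO"] = "1"

import torch

@torch.compile
def f(x):
    return x * 2

f(torch.randn(4))
f(torch.randn(8))
```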
(identifying-dynamic-elements-marked-by-pgo)=
### Identifying Dynamic Elements Marked by PGO
Use `tlparse` to find line numbers of interest and check for multiple values
seen for inputs.
To determine which elements are marked as dynamic by Profile-Guided Optimization (PGO),
follow these steps using `tlparse`:
1. In the `tlparse` output, identify the line number of the frame of interest. Example:
```{image} ../_static/img/dynamic_shapes/tlparse4_pgo.png
```
2. Open `local_code` using `put_local_code_state_` or `put_remote_code_state_` for the
latest frame (for example, 6/1).
Each `?` indicates that multiple values have been observed for this input.
For instance, the following output shows that the input `L['m']` has been seen with
multiple sizes at `size[0]`, but the stride has consistently been 1:
```
/data/users/bobren/a/pytorch/r2.py:2:func:
L['m']: fully dynamic scalar or tensor
L['x']: tensor size=[?] stride=[1]
L['y']: tensor size=[?] stride=[1]
L['z']: tensor size=[?] stride=[1]
```
```{note}
If an element is marked as dynamic by PGO, it does not guarantee that it will remain dynamic in the graph. Specialization can revert it to a static state.
```
## Compiler Collective
Different ranks can communicate with each other to share observed sizes. In the second
iteration, automatic dynamic uses this information to determine which elements to mark
as dynamic based on inputs seen across all ranks. Check this [PR](https://github.com/pytorch/pytorch/pull/130935) for more details.
To enable this feature, use `enable_compiler_collectives=True` with the `@config.patch`
decorator.
```python
@config.patch(enable_compiler_collectives=True)
```
```{note}
This feature enables the use of collectives during compilation to
synchronize behavior across ranks. Currently, it is used to modify
automatic dynamic shapes behavior by inferring if an input is dynamic
based on whether its size varies across ranks. Since this synchronization
uses collectives, all ranks must run compilation simultaneously; ranks must
not diverge with graph breaks. This is most reliably achieved by ensuring
torch is only run on SPMD programs. Violating this invariant may result in
deadlocking NCCL and encountering a NCCL timeout.
```
## Reducing Compilations: Step by Step
If you have a model that you can run on your master job and have a `tlparse`,
here's what you should do next:
### Step 1: Mark Dynamic Elements
The first step is to reduce initial compilations that are eventually optimized away
by automatic dynamic or PGO. This is straightforward because we know it will work
upfront. If, in one run, a frame starts with static graphs and converges to
dynamic graphs, and if you notice a reduction in the number of compiled
frames in a second (warm) PGO-enabled run, it's likely due to this optimization.
This is a two-step process:
1. Find elements marked as dynamic by PGO or automatic dynamic.
2. Mark them as dynamic using one of the {ref}`user_annotations`, as in the sketch at the end of this section.
#### How to Identify Elements to Mark as Dynamic
Follow these guidelines:
1. **PGO artifact:** Follow the steps in {ref}`identifying-dynamic-elements-marked-by-pgo`.
2. **Dynamic Logs:** If you have a run with `TORCH_LOGS="+dynamic"`, each
time a new dynamic dimension is allocated, a debug line will specify it
along with the input name.
3. **Compare Graphs:** For frames with reduced compilations across runs,
inspect the Dynamo graphs in the second run or the latest runs in the
cold run. Look for elements marked as dynamic in those graphs. Specifically,
find graphs that are similar (once specialized and once dynamic).
Even without a warm run, you can inspect all graphs for a specific frame
to see if some are similar and converge to a dynamic version.
For example, in the following `tlparse` snapshot, Dynamo graphs 20/0,
20/1, and 20/2 are similar except for different sizes (for example,
graph 20/0 vs. graph 20/2). In the Dynamo graph of 20/2, sizes `s0`,
`s1`, and `s5` are used for `rotary_pos_emb_` and `x`.
```{image} ../_static/img/dynamic_shapes/tlparse5_dynamic_shapes.png
```
```{tip}
Two graphs are considered similar if they have the same sequence of calls for
torch operations and the same tensor inputs. Variations may exist in integer
inputs that could be inlined in the specialized version or arithmetic
computations that only exist in the dynamic version due to inlining in the
static version.
```
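For example, once PGO or the graph comparison above has pointed at specific inputs, a minimal sketch of marking them dynamic up front looks like this (the input names and dimensions are hypothetical):
```python
# A minimal sketch: mark the dimensions that PGO/automatic dynamic reported as
# varying, so the very first compile already uses a symbolic size for them.
import torch
import torch._dynamo

@torch.compile
def f(x, y):
    return x + y

x = torch.randn(8, 16)
y = torch.randn(8, 16)
torch._dynamo.mark_dynamic(x, 0)  # dim 0 of x was seen with multiple sizes
torch._dynamo.mark_dynamic(y, 0)
f(x, y)
```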
### Step 2: Debugging: Identifying Missed Opportunities
The complexity of debugging can vary greatly depending on the issues you
encounter. The end result is often to find a bug, enable a flag, or modify
user/framework code.
#### Finding Similar Graphs
Start by identifying a group of similar graphs that you might want to combine
into one dynamic graph, as discussed in the previous section on comparing
graphs. If you can't find any similar graphs, there's nothing further to do
in this step.
#### Quick Checks: Fail Fast
After finding similar graphs, you want to understand why they recompile.
Check the following:
1. **Check Recompile Reasons:** For graphs you believe are similar, click on
`recompile_reason` in the `tlparse` output for the later graph. Ensure the
reason is size-related and not due to other factors. For example, while
in this screenshot the recompile reason is size-related:
```{image} ../_static/img/dynamic_shapes/tlparse6_size_related_recompilations.png
```
In the one below it is not, which indicates that dynamic shapes won't resolve it:
```{image} ../_static/img/dynamic_shapes/tlparse7_not_size_related_recompilations.png
:width: 500px
:align: center
```
2. **Compare Guards Files:** Ensure there are no guards on non-size-related
elements that exist in one graph but not the others.
3. **Early Check for Custom Triton Kernels:** Check if your model calls custom
Triton kernels with `tl.constexpr` arguments, as these are always
specialized. If your model receives different values for these arguments,
it could be a source of recompilation.
## **Identifying and Fixing Recompilation Causes**
1. **Is Something Not Marked Dynamic but Should Be?** Determine if an input was
marked dynamic and got specialized or was not marked dynamic at all. You can
identify this by:
* Checking the Dynamo graph - look for `Sym(number)`. For example:
```
Sym(256) vs Sym(s0)
```
* Using dynamic logs:
```
["TORCH_LOGS=+dynamic"]
create_symbol s2 = 2 for L['self']._modules['cle ...
```
* Reviewing guards files. If a tensor size is dynamic, it will be indicated as `None`:
```
TENSOR_MATCH:check_tensor(L['self'].x._parameters['weight']], Parameter, DispatchKeySet(CPU, BackendSelect, ADInplaceOrView, AutogradCPU), torch.float32, device=None, requires_grad=True, size=[None, None], stride=[None, 1])
```
2. **Why Is It Not Marked Dynamic?** If you determine an element is not marked dynamic, consider:
* Checking if it's an `nn` module property, parameter, or field. Verify the settings of these flags:
* `force_parameter_static_shapes = True`
* `force_nn_module_property_static_shapes = True`
* `allow_unspec_int_on_nn_module = False`
* Or using the dynamic allow list to mark it dynamic, which should have the highest priority.
```{tip}
Marking elements one by one can be time-consuming. Initially, flip the flags to
identify any blocking specializations, then decide how to mark them
dynamic at the end of the process.
```
* If you feel like it could be a bug, please file a bug report and mark it
with the `module: dynamic shapes` label. Check the list of known issues in
[this list](https://github.com/pytorch/pytorch/issues?q=sort%3Aupdated-desc+state%3Aopen+label%3A%22module%3A+dynamic+shapes%22).
3. **Is a Dynamic Element Getting Specialized?** Determine why it is specialized.
It could be due to user code (such as an `if` condition), framework code, or a
call to a Triton kernel. To identify the reason for specialization:
* **Using tlparse:** Check the `compilation_metrics` for a specialization section, which will indicate what got specialized and the user and framework stack when it happened. Example:
```{image} ../_static/img/dynamic_shapes/tlparse8_compilation_metrics.png
```
The log above indicates that `s0` is specialized to `33` due to the following code:
```
`if self.x ==33` at example4.py line 16.
```
* **+Dynamic Logs:** pass `["TORCH_LOGS=+dynamic"]`. Look for the first specialization, as once a variable is specialized, all dependent variables get specialized too.
Example log:
```
torch/fx/experimental/symbolic_shapes.py:6557] [0/2] eval Eq(s0, 33) [guard added] if self.x ==33: # example4.py:16 in forward (_dynamo/variables/tensor.py:1242 in evaluate_expr), for more info run with TORCHDYNAMO_EXTENDED_DEBUG_GUARD_ADDED="Eq(s0, 33)"
V0228 12:04:24.190000 2990033 torch/fx/experimental/symbolic_shapes.py:6000] [0/2] _update_var_to_range s0 = VR[33, 33] (update)
```
The log above indicates that `s0` is specialized to `33` due to the following code:
```
if self.x == 33 at example4.py line 16.
```

View File

@ -0,0 +1,45 @@
(backed-vs-unbacked-symints)=
# Backed vs Unbacked Symints
Backed `SymInts` are symbolic integers that have a concrete value or "hint"
associated with them. This means that torch can use these values to make
decisions about control flow, such as determining which branch of code
to execute. They are typically derived from operations where the size or
value is known or can be inferred.
Unbacked `SymInts` are symbolic integers that do not have a concrete value or
hint. They often arise from data-dependent operations, such as `.nonzero()`
or `.item()`, where the size or value cannot be determined at compile time.
Since they lack a concrete value, they cannot be used for control flow
decisions, and attempting to do so requires a graph break.
Unbacked `SymInts` use *size-oblivious reasoning*, which is particularly
useful when you are dealing with the
{ref}`0/1 specialization recompilation problem <zero-one-specialization>`.
In summary, backed `SymInts` have known values that can be used for
decision-making, while unbacked `SymInts` do not, requiring special handling
to avoid graph breaks.
Unbacked symbolic integers can be too restrictive, causing most PyTorch programs
to fail. To address this, you can use the following methods and APIs as
workarounds:
* Use higher-level APIs like `empty` instead of `empty_strided` to create tensors.
This ensures the tensor is non-overlapping and dense, avoiding unnecessary stride
sorting, guard creation, and recomputation of these properties.
* Modify your code to make precomputed properties *lazy*. This ensures that
guards on unbacked symbolic integers are only applied when necessary,
reducing computational overhead.
## How to use unbacked
To use unbacked APIs, replace `mark_dynamic` with `mark_unbacked` and
`TORCH_COMPILE_DYNAMIC_SOURCES` with `TORCH_COMPILE_UNBACKED_SOURCES`.
This tells the compiler to treat an input as unbacked.
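A minimal sketch of the swap, assuming `torch._dynamo.decorators.mark_unbacked` as the unbacked counterpart of the `mark_dynamic` call shown elsewhere in these docs:
```python
# A minimal sketch: treat dim 0 of x as unbacked rather than merely dynamic,
# so the compiler never relies on a concrete hint (including 0 or 1) for it.
import torch
import torch._dynamo

x = torch.randn(7, 3)
torch._dynamo.decorators.mark_unbacked(x, 0)

@torch.compile(fullgraph=True)
def f(x):
    return x.sum(dim=0)

f(x)
```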
```{seealso}
* {ref}`dynamic_shapes`
* {ref}`torch.export`
* {ref}`what_is_a_specialization`
```

View File

@ -0,0 +1,10 @@
(dynamic_shapes_beyond_the_basics)=
# Beyond the Basics
This section covers some advanced topics related to dynamic shapes. This includes more complex explanations of how dynamic shapes work, 0/1 specialization problems, and so on.
```{toctree}
:maxdepth: 1
dynamic_shapes_zero_one_specialization
dynamic_shapes_backed_unbacked
```

View File

@ -0,0 +1,134 @@
(dynamic_shapes_core_concepts)=
# Dynamic Shapes Core Concepts
This section describes the core concepts of dynamic shapes in PyTorch. It is intended to be a
reference for engineers working on the PyTorch compiler stack and anyone who wants to understand
the inner workings of dynamic shapes.
## Symbolic integers
Symbolic integers (`SymInts`) are used to represent variables that can span a range. For example:
```python
x = torch.randn(5, 5) # this tensor has a shape [5, 5]
torch._dynamo.decorators.mark_dynamic(x, 0)
x = torch.randn(5, 5) # this tensor has a shape [s0, 5]
y = torch.cat([x, x], dim=0) # this tensor has a shape [2*s0, 5]
```
However, `z = x * y` would throw an error, since pointwise operations like multiply must
operate on same-sized tensors but we know statically that `s0 != 2 * s0`. Astute readers may point out
that this is not true when `s0 == 0`; the reason why that doesn't matter here is described in
{ref}`zero-one-specialization`.
## Guards
In `torch.compile`, a guard is a mechanism that is used to ensure the validity of a compiled code graph.
By default, when you make a variable dynamic, it can range from `[-inf, inf]`. For example:
```python
def foo(x):
    return x / 2
```
This works for any dynamic `x`. But if your code is:
```python
def foo(x):
    if x > 5:
        return x / 2
    return x / 3
```
If you call `foo(6)`, it returns `x / 2` and adds a guard `x > 5`. Calling `foo(4)` later will
require recompilation because the guard is broken.
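An illustrative sketch of this behavior (the exact guards and recompile points depend on dynamo's specialization heuristics for integer inputs):
```python
# Illustrative only: a data-dependent branch on a dynamic input installs a
# guard, and a later call that violates it triggers recompilation.
# Run with TORCH_LOGS=recompiles to see the recompile reasons.
import torch

@torch.compile(dynamic=True)
def foo(x, t):
    if x > 5:
        return t / 2
    return t / 3

t = torch.randn(4)
foo(6, t)  # compiles; the branch adds a guard along the lines of x > 5
foo(4, t)  # the guard no longer holds, so this call recompiles
```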
## Runtime Asserts
You can use runtime asserts to provide hints when you know certain facts, like batch size being less than 100:
```python
def foo(batch_size):
    torch._check(batch_size < 100)
    if batch_size < 100:
        return do_something()
    return do_something_else()
```
## "Hint" Value
A "hint value" in the context of `torch.compile` refers to the actual values known during the compilation process that help the JIT compiler make decisions about expressions. Hint values are particularly useful for handling dynamic shapes, as they provide concrete information that guides the compilation without requiring recompilation for varying dimensions.
## Dynamic Behavior Overview
PyTorch assumes static shapes by default. When a size change is detected, it attempts to
recompile with dynamic input, although this may fail if there are conditional branches
or missing support for dynamic shapes. To diagnose overspecialization, you can set
`TORCH_LOGS=dynamic` to view "eval" entries that indicate when and why guards are added.
If you anticipate a dimension will be dynamic, you can use `torch._dynamo.mark_dynamic(tensor, dim)`
to mark it in advance, specifying `min` and `max` values if known. Using `torch.compile(dynamic=False)`
disables automatic dynamic shapes, leading to recompilation for each unique size. Conversely,
`torch.compile(dynamic=True)` aims to use dynamic shapes as much as possible, which is most useful
for small models and may not be suitable for large models due to potential crashes or performance issues.
You can whitelist specific sources to be marked as dynamic using the `TORCH_COMPILE_DYNAMIC_SOURCES` environment variable or `torch.compiler.config.dynamic_sources`. This is particularly useful for large
models with graph breaks, as you can maintain dynamism across graph breaks since
source names stay consistent. You can also use this to mark integers as dynamic. The format is a comma-delimited list of source names, for example, `"L['x'], L['y']"`.
You can also use regexes, for example, `"L\['x.*'\], L\['y.*'\]"`.
This whitelist takes precedence over other flags like `dynamic=False`, `force_nn_module_property_static_shapes`, and `force_parameter_static_shapes`.
Sometimes it can be cumbersome to find the right inputs to mark as dynamic. If
you're willing to take a performance hit for the first batch, one other affordable
option we have are the `eager_then_compile` stances which derive dynamism for you.
See {func}`torch.compiler.set_stance` for more details.
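For example, a minimal sketch of marking a dimension dynamic ahead of time with known bounds (the bounds and sizes here are illustrative):
```python
import torch

@torch.compile
def f(x):
    return x * 2

x = torch.randn(16, 3)
# Mark dim 0 as dynamic up front; min/max are optional bounds if known.
torch._dynamo.mark_dynamic(x, 0, min=2, max=1024)
f(x)                   # compiles once with a symbolic batch dimension
f(torch.randn(64, 3))  # reuses the compiled code, no recompilation
```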
## Overall Architecture
Symbolic shapes workflow:
1. When compiling a frame in Dynamo, we allocate a `ShapeEnv` (attached to `FakeTensorMode`) to
track symbolic shapes.
2. We allocate symbolic sizes for tensors on entry, based on policy decisions.
3. We propagate symbolic sizes through operators, maintaining both FX IR for symbolic compute export
and Sympy expressions for reasoning.
4. We add guards based on conditionals during Dynamo tracing or Inductor optimization, induced from both Python and C++.
5. Guards can simplify symbolic variables. For instance, asserting `s0 == 4` allows replacing all occurrences of `s0` with `4`.
6. After tracing and optimizing, we install all guards with the compiled code, ensuring reusability only if all guards evaluate true.
## Internal API Class Hierarchy
### Python Classes
- **`SymInt`/`SymFloat`/`SymBool`**: User-visible classes that simulate their `int`/`float`/`bool` counterparts. Adding two `SymInts` produces a new `SymInt` that symbolically tracks the integer addition.
- **`SymNode`**: Internal structure (accessible via `symint.node`) that holds actual symbolic tracking information. `SymNode` is type-erased, making it convenient to represent mixed-type operations.
- **`ShapeEnv`**: Per-compile context state that tracks all free symbols and guards accumulated so far. Every `SymNode` records its `ShapeEnv` (but not vice versa; `SymNodes` are only used if they participate in a guard).
### C++ Equivalents
- **`c10::SymInt`/`SymFloat`/`SymBool`**: User-visible classes that simulate `int`/`float`/`bool`
- **`c10::SymNode`/`SymNodeImpl`**: Analogous to Python `SymNode`
- **No C++ `ShapeEnv`**: For debugging ease, the entire symbolic reasoning apparatus remains in Python
When writing code traceable with `make_fx`, it must handle `SymInt`/`SymFloat`/`SymBool` flowing through it.
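For example, a minimal sketch of symbolic tracing with `make_fx`, where sizes flow through as `SymInts`:
```python
import torch
from torch.fx.experimental.proxy_tensor import make_fx

def f(x):
    # Under symbolic tracing, x.shape[0] is a SymInt, so this arithmetic is
    # recorded symbolically (sym_size and mul nodes) rather than as a constant.
    return x.new_zeros(x.shape[0] * 2)

gm = make_fx(f, tracing_mode="symbolic")(torch.randn(4, 3))
print(gm.graph)
```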
## Value Ranges and Constraints
Symbolic variables maintain **value ranges** that specify the set of possible values. By default:
- Size-like unbacked `SymInts` have value range `[0, Inf]`
- Regular unbacked `SymInts` have value range `[-Inf, Inf]`
When assertions are made (e.g., `torch._check(x == y)`), the system:
1. Attempts to replace unbacked symbols with equivalent expressions
2. Refines value ranges based on the assertion
3. Remembers boolean expressions that are always true
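As a minimal sketch of these effects (the sizes are illustrative; `capture_scalar_outputs` is needed so `.item()` does not graph break):
```python
import torch

torch._dynamo.config.capture_scalar_outputs = True

@torch.compile(fullgraph=True)
def f(n, y):
    u0 = n.item()                   # regular unbacked SymInt, range [-Inf, Inf]
    torch._check(u0 >= 0)           # refines the value range to [0, Inf]
    torch._check(u0 <= y.shape[0])  # remembered as an always-true expression
    return y.narrow(0, 0, u0)       # no guard on the data-dependent size needed

f(torch.tensor(3), torch.randn(10))
```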
Important files:
- C++ SymInt API: `c10/core/SymInt.h`, `SymFloat.h`, `SymBool.h`
- Python SymInt API: `torch/__init__.py` (look for `SymInt/SymFloat/SymBool`)
- C++ plumbing: `c10/core/SymNodeImpl.h`, `torch/csrc/utils/python_symnode.h`, `torch/csrc/jit/python/init.cpp`
- Python infrastructure: `torch/fx/experimental/symbolic_shapes.py`
- Other important files: `torch/_subclasses/fake_tensor.py`, `torch/_meta_registrations.py`, decomps, PrimTorch refs
```{seealso}
* {ref}`dynamic_shapes`
* {ref}`dynamic_shapes_troubleshooting`
```

View File

@ -0,0 +1,101 @@
(debugging-tlparse-torch-logs)=
# Debugging with `tlparse` and `TORCH_LOGS=dynamic`
`tlparse` is a tool used for analyzing and understanding the compilation
process in PyTorch, particularly when dealing with dynamic shapes. It helps
identify where guards and specializations occur in your code.
`TORCH_LOGS=dynamic` is an environment variable setting that enables detailed
logging of dynamic shape operations, providing insights into how symbolic
shapes are handled during execution.
This section will guide you through using `tlparse` and `TORCH_LOGS=dynamic` to
troubleshoot dynamic shape issues in your code, including debugging
specialization, guards, and more.
## Debugging Specialization
In the following example, `x.shape[0]` is dynamic but becomes specialized due to multiplication:
```python
import torch
@torch.compile
def fn(x, y):
return x * y
x = torch.randn(5)
y = torch.randn(5)
torch._dynamo.decorators.mark_dynamic(x, 0)
fn(x, y)
```
By using `TORCH_LOGS=dynamic`, you can observe this specialization in the logs:
```xml
TORCH_LOGS=dynamic python tl.py
I0721 11:10:00.950000 845259 torch/fx/experimental/symbolic_shapes.py:3776] [0/0] create_env
I0721 11:10:01.030000 845259 torch/fx/experimental/symbolic_shapes.py:5117] [0/0] create_symbol s77 = 5 for L['x'].size()[0] [2, int_oo] return x * y # tl.py:5 in fn (_dynamo/variables/builder.py:3466 in <lambda>), for more info run with TORCHDYNAMO_EXTENDED_DEBUG_CREATE_SYMBOL="s77" or to suppress this message run with TORCHDYNAMO_EXTENDED_ADVICE="0"
I0721 11:10:01.038000 845259 torch/fx/experimental/symbolic_shapes.py:7211] [0/0] eval Eq(s77, 5) [guard added] return x * y # tl.py:5 in fn (_subclasses/fake_impls.py:922 in infer_size), for more info run with TORCHDYNAMO_EXTENDED_DEBUG_GUARD_ADDED="Eq(s77, 5)"
```
The line `eval Eq(s77, 5) [guard added] return x * y # tl.py:5` indicates the specialization.
## Debugging Guards
Consider the following code, which may cause recompilations due to dynamic
shapes:
```python
import torch
@torch.compile
def fn(x, y):
if x.shape[0] < 10:
return x * y
x = torch.randn(5)
y = torch.randn(5)
torch._dynamo.decorators.mark_dynamic(x, 0)
torch._dynamo.decorators.mark_dynamic(y, 0)
fn(x, y)
```
To identify where dynamic shape guards originate, use `tlparse`. Here is an example tlparse output:
```{image} ../_static/img/dynamic_shapes/tlparse9_debugging_guards.png
```
By clicking on the `dynamo_cpp_guards` link, you can view all guards from the compilation, including the symbolic shape guard `L['x'].size()[0] <= 9`.
Astute readers will notice the 0/1 specialization where we guard on `L['x'].size()[0] >= 2`. By modifying the code to use unbacked symbols, this guard is removed:
```python
import torch
@torch.compile
def fn(x, y):
# Necessary runtime assert since we can't guard on unbacked
torch._check(x.shape[0] < 10)
if x.shape[0] < 10:
return x * y
x = torch.randn(5)
y = torch.randn(5)
torch._dynamo.decorators.mark_unbacked(x, 0)
torch._dynamo.decorators.mark_unbacked(y, 0)
fn(x, y)
```
Now, this compiled region can be used for inputs of size 0 and 1:
```{image} ../_static/img/dynamic_shapes/tlparse10_debugging_guards_unbacked.png
```
```{seealso}
* {ref}`dynamic_shapes`
* {ref}`troubleshooting_guardondatadependentsymnode_errors`
```

View File

@ -0,0 +1,14 @@
(dynamic_shapes_troubleshooting)=
# Troubleshooting Dynamic Shapes
This section contains a list of common issues that you may encounter when using
dynamic shapes. The section describes how to use `TORCH_LOGS` and `tlparse` to
debug the issues, as well as provides some general tips and tricks to help you
resolve the issues.
```{toctree}
:maxdepth: 1
dynamic_shapes_debugging_tlparse_torch_logs
dynamic_shapes_troubleshooting_guardon_errors
```

View File

@ -0,0 +1,411 @@
(troubleshooting_guardondatadependentsymnode_errors)=
# Troubleshooting GuardOnDataDependentSymNode Errors
When working with PyTorch models that have data-dependent control flow (using functions
like `item()`, `tolist()`, or `nonzero()`), you may encounter `GuardOnDataDependentSymNode` errors.
This section explains what these errors are and how to fix them.
## Common Error Pattern
The following output shows the common error pattern for `GuardOnDataDependentSymNode` errors:
```sh
torch.fx.experimental.symbolic_shapes.GuardOnDataDependentSymNode: Could not guard on data-dependent expression Eq(u2, -1) (unhinted: Eq(u2, -1)). (Size-like symbols: none)
Potential framework code culprit (scroll up for full backtrace):
File "/data/users/ezyang/a/pytorch/torch/_prims_common/__init__.py", line 855, in infer_size
if d == -1:
For more information, run with TORCH_LOGS="dynamic"
For extended logs when we create symbols, also add TORCHDYNAMO_EXTENDED_DEBUG_CREATE_SYMBOL="u2"
If you suspect the guard was triggered from C++, add TORCHDYNAMO_EXTENDED_DEBUG_CPP=1
For more debugging help, see https://docs.google.com/document/d/1HSuTTVvYH1pTew89Rtpeu84Ht3nQEFTYhAX3Ypa_xJs/edit?usp=sharing
```
## Root Cause
These errors occur when PyTorch tries to convert a symbolic quantity (for example, `u2 == -1`)
into a concrete value (such as, `False`) to make branching decisions. In a typical scenario,
where data-dependent sizes are not involved, PyTorch can determine the concrete value at
compile time and install a guard to ensure the compilation result remains valid. However,
with data-dependent quantities, the true value is unknown at compile time, resulting in errors.
You can often rewrite your model, by adding `torch._check` or `torch._check_is_size` to
bypass these issues. This document aims to teach you how.
## Debugging Tools
Here is the list of some of the debugging tools available in PyTorch that you can use to troubleshoot these errors:
* `TORCH_LOGS="dynamic"` - Shows detailed logs about symbolic operations
* `TORCHDYNAMO_EXTENDED_DEBUG_CREATE_SYMBOL="u2"` - Provides extended logs for specific symbols
* `TORCHDYNAMO_EXTENDED_DEBUG_CPP=1` - Helps when guards are triggered from C++
## Error Variations
Here is the list of error variations that you might encounter:
| Error Variations | Description |
|------------------|-------------|
| "Could not guard on data-dependent expression" | Occurs when trying to extract a concrete boolean from expressions like u0 == 0 or u0 > 10 |
| "Could not extract specialized integer from data-dependent expression" | Occurs when trying to extract a concrete integer value. <br/> **Common causes:** <br/> - Control flow that depends on the integer (such as, looping `u0` times) <br/> - Overspecialization in code that could work symbolically |
## How to Diagnose Your Problem
### Step 1: Examine the Potential Framework Culprit (Python Backtrace)
The exception provides a backtrace, which often indicates the problem.
Given that PT2 backtraces can be lengthy, the error message will also
suggest a potential framework culprit. For example:
```sh
Potential framework code culprit (scroll up for full backtrace):
File "/data/users/ezyang/a/pytorch/torch/_prims_common/__init__.py", line 855, in infer_size
if d == -1:
```
**Consider the Following:**
* Does it make sense that this condition is triggering a guard on a
data-dependent symbol?
* Should we know if the quantity in question is size-like?
(The exception lists size-like symbols; if a symbol is not listed,
it might be an arbitrary integer.)
* If the equation involves two distinct symbols, should we know
they are actually equal?
* If all symbols are size-like but the equation involves 0 or 1,
are we missing a `guard_size_oblivious` wrapper? (Remember, for
`guard_size_oblivious` between two size tuples, use `sym_eq` instead
of regular equality.)
In the example above, testing if `d` (a data-dependent value) is `-1` suggests
that `d` should be non-negative if it were a size. This indicates a missing
`torch._check_is_size`. If `d` is already size-like but `numel() == 0` fails,
consider wrapping it in `guard_size_oblivious`.
Using `TORCH_LOGS=dynamic` and examining the user stack trace is crucial for
understanding how to fix the problem, as they guide you on how to modify the
user program.
```sh
[INFO] create_unbacked_symint u0 [-9223372036854775808, 9223372036854775807] (w.py:40 in custom_op_meta)
```
This log message indicates where (`w.py:40`) the unbacked `SymInt` was
allocated. An unbacked `SymInt` may be allocated multiple times, so track
their equalities:
```sh
[INFO] set_replacement u1 = u0 (trivial_lhs) ValueRanges(lower=0, upper=9223372036854775807, is_bool=False)
```
### Step 2: Examine the C++ Backtrace
If the framework code culprit is uninformative, the guard might be in C++. You can
force a C++ backtrace by running with `TORCHDYNAMO_EXTENDED_DEBUG_CPP=1`. This
provides a detailed C++ backtrace with Python, CPython, and C10/ATen/libtorch
frames interspersed. Look for symbols in the `at::` or `c10::` namespace that
resemble kernel-specific code, likely related to the kernel executed per the Python
backtrace. If using a non-debug build of PyTorch, inlining may cause missing
frames, requiring source code investigation to locate the issue. For example, see https://github.com/pytorch/pytorch/pull/118579.
Here is an example C++ backtrace from a debugging session:
```
[2024-02-08 08:20:45,259] torch.fx.experimental.symbolic_shapes: [INFO] File "../__gen_aten__/out/RegisterCompositeImplicitAutograd.cpp", line 2025, in at::(anonymous namespace)::(anonymous namespace)::wrapper_CompositeImplicitAutograd_Tensor_narrow(at::Tensor const&, long, at::Tensor const&, c10::SymInt)
[2024-02-08 08:20:45,259] torch.fx.experimental.symbolic_shapes: [INFO] File "../aten/src/ATen/native/TensorShape.cpp", line 1410, in at::native::narrow_tensor_symint(at::Tensor const&, long, at::Tensor const&, c10::SymInt)
[2024-02-08 08:20:45,259] torch.fx.experimental.symbolic_shapes: [INFO] File "../__gen_aten__/out/core/TensorMethods.cpp", line 52, in long at::Tensor::item<long>() const
[2024-02-08 08:20:45,259] torch.fx.experimental.symbolic_shapes: [INFO] File "../ATen/core/TensorBody.h", line 4274, in at::Tensor::item() const
```
In this example, `at::native::narrow_tensor_symint` calls into `item`, which
triggers the guard on a data-dependent `SymNode`. You can modify the C++ code to
avoid specializing, or verify if you should be in this C++ code (e.g., `start` was
not expected to be a `Tensor`, and modifying this fixed the problem).
## Tools for Fixing Errors
There are a few important functions which you should use to troubleshoot this problem.
### torch._check(cond, msg_fn)
`torch._check` is a function used to assert conditions at runtime, particularly when dealing with symbolic integers (`SymInts`) in PyTorch.
**Example Usage:**
```python
torch._check(x.size(0) == y, lambda: f"size mismatch: {x.size(0)} != {y}")
```
The code above does the following:
* Creates a deferred runtime assertion instead of a compile-time guard
* Teaches the symbolic reasoning system facts about your unbacked SymInts
* Can eliminate unbacked symbols by replacing them with equivalent expressions
* Refines value ranges of symbols
* Remembers boolean expressions that are always true
Semantically, the function behaves like a conditional check:
```python
if not cond:
raise RuntimeError(msg_fn())
```
But there are a number of key differences:
* The condition is always assumed true at compile time, even if it involves unbacked `SymInts`. The actual check is deferred to runtime, avoiding
compile-time errors. Instead of setting up a guard, we implement a
deferred runtime assertion to verify the condition at runtime. At compile
time, we assume the condition won't trigger an error, so we don't need
to determine if it evaluates to `True` or `False`.
* If you perform an equality test `u0 == RHS`, we try to replace all instances
of `u0` with RHS. We will ALWAYS do this if RHS has no unbacked symbols,
as removing unbacked symbols is beneficial—eliminating them prevents
the creation of a `GuardOnDataDependentSymNode`. Even if we are not able
to eliminate u0, we can refine its value range. The value range specifies
what the set of possible values for a variable are. By default, size-like
unbacked SymInts have a value range of `[0, Inf]`; if you assert it is
equal to an expression with a refined value range, say `[2, 20]`, then
`u0`'s value range will be updated to `[2, 20]`. We also have limited
support for propagating value ranges in reverse.
* If you perform a boolean test `f(u0)`, we will remember that this expression always evaluates to True, and if you evaluate an expression that contains this expression, we will substitute it with True. We also support some limited reasoning on logically equivalent statements. For example, if you `torch._check(u0 < 4)`, we will also know that `u0 >= 4` evaluates to `False`, and so performing a test like this in a normal non-check conditional will go through fine.
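Putting these behaviors together, a minimal sketch (the shapes and bounds are illustrative):
```python
import torch

torch._dynamo.config.capture_scalar_outputs = True

@torch.compile(fullgraph=True)
def f(n, y):
    u0 = n.item()
    # Equality with an expression containing no unbacked symbols: u0 is
    # replaced by y.shape[0] wherever it appears afterwards.
    torch._check(u0 == y.shape[0])
    # Boolean fact remembered: u0 >= 100 is now known to evaluate to False.
    torch._check(u0 < 100)
    return torch.zeros(u0) + y

f(torch.tensor(8), torch.randn(8))
```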
### `torch._check_is_size(size)` and `guard_size_oblivious(cond)`
Example:
```python
u0 = y.item()
torch._check_is_size(u0)
```
**Semantic Equivalent:**
```python
if u0 < 0:
raise RuntimeError("u0 is not a size")
```
**Key Differences:**
Like `torch._check`, this test will always succeed at compile time, and it will establish that `u0 >= 0`. This refines the value range of `u0` to `[0, Inf]` instead of `[-Inf, Inf]`.
Marking `u0` as size-like is crucial. Size-like unbacked `SymInts` behave like
their regular counterparts, except when involved in a boolean expression
evaluated with `guard_size_oblivious`. In such cases, they are assumed not to equal zero or one, temporarily setting their value range to `[2, Inf]`. For instance, a conditional check like `u0 == 1` will evaluate to `False` when `u0` is size-like, instead of causing an error.
For example, `guard_size_oblivious(u0 == 1)` will always return `False` when `u0`
is size-like.
Marking unbacked symbols as size-like is essential in contexts where tensor
sizes are expected. PyTorch internals often check if sizes are zero or one to
handle special cases related to empty or single-element tensors. If you pass an
unbacked symbol to a factory function like `torch.empty`, it will automatically
be marked as size-like. However, some quantities, like arguments to `Tensor.view`,
cannot be inferred as size-like because `-1` is a valid argument. In such cases,
you need to explicitly use `torch._check_is_size` on an unbacked `SymInt` before
passing it to `view`.
In PyTorch framework code, if you need to test a size for zero or one, wrap the
test in `guard_size_oblivious` to assume that size-like unbacked `SymInts` will
not pass this test. Generally, most framework code has logic for the `>= 2`
case, which works for the `0/1` case. If using `guard_size_oblivious` in
PyTorch framework code resolves your issue, it's likely acceptable. However,
avoid using `guard_size_oblivious` in user code, especially if different
behavior is required for the `0/1` case at runtime, such as in a
hand-tracking application.
In C++, this can be done with `TORCH_GUARD_SIZE_OBLIVIOUS(u0.sym_eq(0))`, for example.
### torch._check_is_size(size, max=upper_bound) (New)
This function is semantically equivalent to `torch._check(size <= upper_bound)`.
However, under `guard_size_oblivious`, it assumes that `size < upper_bound`.
This functionality only works when the upper bound is an integer constant. If
`upper_bound` is a symbolic expression, normal semantics apply. There is
potential to extend this functionality to symbolic expressions with further
development.
For more details, see the related issue https://github.com/pytorch/pytorch/issues/120288.
### `torch._constrain_as_value` and `torch._constrain_as_size`
These APIs are more specialized and are effectively equivalent to
`torch._check` and `torch._check_is_size`, with the added capability
of adjusting the value range of a variable by specifying minimum and
maximum values. However, in recommendation models, these functions are
unlikely to resolve `GuardOnDataDependentSymNode` errors effectively.
While `constrain_as_value` might seem like a convenient way to ensure a
variable stays within the bounds of another tensor, it is often impractical.
This is because value ranges only support constant bounds, and it's common
for the tensor you want to index into to have a symbolic dimension (for
example, `s0`). Using its size as the maximum value for a value range
will force specialization, which is usually undesirable. Instead, if
necessary, manually handle range checks by using `torch._check()` on
appropriate expressions based on the errors you encounter.
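A minimal sketch of this manual range-check approach (the function name and shapes are illustrative):
```python
import torch

torch._dynamo.config.capture_scalar_outputs = True

@torch.compile(fullgraph=True)
def select_row(idx, table):
    u0 = idx.item()
    # Manual range checks instead of constraining u0 with a symbolic bound:
    torch._check(u0 >= 0)
    torch._check(u0 < table.shape[0])
    return table[u0]

select_row(torch.tensor(2), torch.randn(5, 4))
```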
## Common Fix Patterns
There are several common methods to resolve issues like this. Below,
we outline the most frequently used solutions.
### When It's Unfixable
In some cases, the issue is genuinely unfixable due to the nature of the code.
Consider the following example:
```python
i = x.item()
if i > 4:
return x * 2
else:
return x + 3
```
If the user code is branching on a data-dependent value, it is impossible to
trace as is. In such cases, you may need to consider alternative approaches,
such as using `torch.cond`.
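For the branching example above, a sketch of a `torch.cond` rewrite (keeping the predicate as a tensor instead of calling `.item()`):
```python
import torch

@torch.compile(fullgraph=True)
def f(x):
    # Both branches are traced into the graph; no guard is placed on the
    # data-dependent predicate.
    return torch.cond(x.sum() > 4, lambda x: x * 2, lambda x: x + 3, (x,))

f(torch.randn(3))
```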
Another common pattern involves indexing with a data-dependent value:
```python
return self.mlps[x.item()]
```
Here, `self.mlps` is a Python list or `ModuleList`, and the code branches on a data-dependent value. The simplest solution is to induce a graph break before the indexing operation.
### `u0` is a Size, but We Don't Know It
Some guards fail on tests that essentially ask, "Is this a size?" but we don't know it is a size. These fall into two categories:
1. **Regular Tests:**
These are tests like `u0 >= 0` or `u0 != -1` that are unconditionally true
for sizes. Adding a `torch._check_is_size(...)` on the relevant size will
assert that these tests are true. This is typically uncommon because if
the test is for error checking, we can infer that the condition must be
true, as an error would occur otherwise. An important exception is APIs
that accept both sizes and `-1`; in such cases, the user must indicate that
the input data-dependent quantity cannot be `-1`, as something unusual would
happen otherwise. For an example, see
https://github.com/pytorch/pytorch/pull/107788.
Sometimes, you can refactor an error-checking API to split a logical
disjunction of conditionals into separate conditionals. If you can do so
to achieve a single `torch._check(x == y)` statement, it will enable
the automatic generation of a deferred runtime assertion. For an example,
see https://github.com/pytorch/pytorch/pull/110979.
2. **Edge Case Tests:**
These are tests like `u0 == 0` or `u0 == 1`, which are not always true for
sizes, but where our choice doesn't really matter. These tests handle edge
cases, such as dealing with an empty tensor or testing for broadcasting when
we want to assume broadcasting is not occurring. To resolve these situations,
two steps are needed:
* First, the guard itself must be evaluated via `guard_size_oblivious`,
which assumes that size-like integers cannot equal zero or one, with the
promise that if they do, something reasonable will happen.
* Second, the symbols themselves must be marked as size-like, either
inferred because they were passed to tensor factory functions or explicitly
specified with `torch._check_is_size(...)`. For examples of making guards
size-oblivious, see https://github.com/pytorch/pytorch/pull/118579.
Sometimes, these tests can occur in C++. While there are corresponding
C++ APIs for these tests, it can be more challenging to localize the problem,
as you do not get a useful backtrace by default.
### `u0` is Actually Equal to `u1`, but We Don't Know It
Multiple unbacked `SymInts` can be known to be equal at compile time:
```python
i0 = x.sum().item()
i1 = x.sum().item()
return torch.randn(i0) + torch.randn(i1)
```
If there is a `torch._check(i0 == i1)` somewhere (in the example above, this
check would occur inside the shape-checking rule for addition), we will
automatically unify the two unbacked `SymInts` and recognize them as equal.
However, if such an assertion is missing, you may need to explicitly add an
assertion to achieve this unification. For an example, see
https://github.com/pytorch/pytorch/issues/111950.
```{note}
If we allocate an unbacked `SymInt` and
immediately set it equal to another, these instances are benign and not easily
eliminated entirely from the framework.
```
### `u0` is a Tensor
Another reason you might be overallocating unbacked `SymInts` is due to passing
around a `Tensor` and relying on its implicit conversion to an integer. Many
functions that accept an integer will also accept a `Tensor` and automatically
call `item()` on the tensor passed in its place. It's beneficial to examine
`TORCH_LOGS=dynamic` to determine whether the number of unbacked `SymInts` is
as expected or excessive. When this occurs, a new `SymInt` will be allocated at
the line where a PyTorch function is invoked.
This issue is less likely to cause problems now because the return value of
`t.item()` is memoized, ensuring that you consistently receive the same unbacked
`SymInt` if you call it multiple times.
### Overspecialization Issue
In non-strict export mode, consider the following code:
```python
u0 = x.sum().item()
return y[:u0]
```
This code will fail when trying to evaluate `u0` because, when a `SymInt` is
used directly inside a Python slice (without using Dynamo), Python forces the
integer to be specialized and fails if it is unbacked.
To resolve this, you can rewrite the program to avoid specialization.
For the example above, you can fix it by not using slices:
```python
u0 = x.sum().item()
return y.narrow(0, 0, u0)
```
For more details, see the related issue
https://github.com/pytorch/pytorch/issues/111950.
### Use Lengths Instead of Offsets
When working with variable sequence lengths, it's common to have tensors
representing either the lengths or offsets of the sequences. For example, given
`values = [[1, 2, 3], [4, 5], [6, 7, 8, 9]]`, you might have `lengths = [3, 2, 4]`
and `offsets = [0, 3, 5, 9]`. While these representations are interconvertible,
it's better to work with lengths when dealing with them as integers (by calling
`lengths.tolist()`), rather than offsets.
The reason is that when you perform a `torch.split()` on your `values` tensor, you
need to create tensors for each sub-sequence, such as tensors of sizes 3, 2, and 4.
If you have unbacked `SymInts` for sizes, they become `u0`, `u1`, and `u2`. You can
easily indicate that they are size-like, and you're done. However, if you have
unbacked `SymInts` for offsets, they become `u1 - u0`, `u2 - u1`, `u3 - u2`, which
complicates matters. These quantities cannot be conveniently marked as size-like,
leading to potential issues. Since it's relatively straightforward to write code
using either lengths or offsets, you should prefer using lengths.
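A minimal sketch of the lengths-based pattern (names and sizes are illustrative):
```python
import torch

torch._dynamo.config.capture_scalar_outputs = True  # needed for .tolist()

@torch.compile
def split_by_lengths(values, lengths):
    # Each length becomes its own size-like unbacked SymInt (u0, u1, ...),
    # which is much easier to reason about than offset differences.
    sizes = lengths.tolist()
    for s in sizes:
        torch._check_is_size(s)
    return torch.split(values, sizes)

values = torch.arange(9.0)
lengths = torch.tensor([3, 2, 4])
split_by_lengths(values, lengths)
```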
```{seealso}
* {ref}`dynamic_shapes`
* {ref}`debugging-tlparse-torch-logs`
```

View File

@ -0,0 +1,33 @@
(zero-one-specialization)=
# The Zero-One Specialization Problem
Before you read this section, you should understand the basics of
dynamic shapes. Make sure you have read the following sections:
* {ref}`dynamic_shapes`
* {ref}`torch.export`
* {ref}`what_is_a_specialization`
In `torch.compile`, we specialize automatically on inputs with sizes
0 or 1 and assume that any remaining inputs cannot be 0 or 1. This
simplifies tasks like contiguity and broadcasting checks, as it
avoids adding extra guards. However, this can cause problems for
sparse models with many symbolic integers that in practice have
tensors of size 0, 1, or 2. For example, consider a task such as
collecting likes on a page.
While it's possible to stop specializing on 0/1 upfront, executing
normal PyTorch code often reintroduces 0/1 guards, as many conditions
in PyTorch check for values being 0 or 1. Although models that work
for `N > 2` often generalize to `N = 1`, this isn't guaranteed, especially
with symbolic variables. For example, in hand tracking, a dimension
size of `N = 0`, `1`, or `2` may lead to different graph behaviors.
Simply hoping that the `N > 2` model generalizes can expose soundness issues.
```{seealso}
* {ref}`dynamic_shapes`
* {ref}`torch.export`
* {ref}`what_is_a_specialization`
* {ref}`backed-vs-unbacked-symints`
```

View File

@ -34,75 +34,75 @@ Read more about feature classification at: https://pytorch.org/blog/pytorch-feat
Below is an example that uses cond to branch based on input shape:
```python
import torch

def true_fn(x: torch.Tensor):
    return x.cos() + x.sin()

def false_fn(x: torch.Tensor):
    return x.sin()

class DynamicShapeCondPredicate(torch.nn.Module):
    """
    A basic usage of cond based on dynamic shape predicate.
    """

    def __init__(self):
        super().__init__()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        def true_fn(x: torch.Tensor):
            return x.cos()

        def false_fn(x: torch.Tensor):
            return x.sin()

        return torch.cond(x.shape[0] > 4, true_fn, false_fn, (x,))

dyn_shape_mod = DynamicShapeCondPredicate()
```
We can eagerly run the model and expect the results vary based on input shape:
```python
inp = torch.randn(3)
inp2 = torch.randn(5)
assert torch.equal(dyn_shape_mod(inp), false_fn(inp))
assert torch.equal(dyn_shape_mod(inp2), true_fn(inp2))
```
We can export the model for further transformations and deployment:
```python
inp = torch.randn(4, 3)
dim_batch = torch.export.Dim("batch", min=2)
ep = torch.export.export(DynamicShapeCondPredicate(), (inp,), {}, dynamic_shapes={"x": {0: dim_batch}})
print(ep)
```
This gives us an exported program as shown below:
```
class GraphModule(torch.nn.Module):
    def forward(self, arg0_1: f32[s0, 3]):
        sym_size: Sym(s0) = torch.ops.aten.sym_size.int(arg0_1, 0)
        gt: Sym(s0 > 4) = sym_size > 4;  sym_size = None
        true_graph_0 = self.true_graph_0
        false_graph_0 = self.false_graph_0
        conditional: f32[s0, 3] = torch.ops.higher_order.cond(gt, true_graph_0, false_graph_0, [arg0_1]);  gt = true_graph_0 = false_graph_0 = arg0_1 = None
        return (conditional,)

    class <lambda>(torch.nn.Module):
        def forward(self, arg0_1: f32[s0, 3]):
            cos: f32[s0, 3] = torch.ops.aten.cos.default(arg0_1)
            sin: f32[s0, 3] = torch.ops.aten.sin.default(arg0_1);  arg0_1 = None
            add: f32[s0, 3] = torch.ops.aten.add.Tensor(cos, sin);  cos = sin = None
            return add

    class <lambda>(torch.nn.Module):
        def forward(self, arg0_1: f32[s0, 3]):
            sin: f32[s0, 3] = torch.ops.aten.sin.default(arg0_1);  arg0_1 = None
            return sin
```
Notice that `torch.cond` is lowered to `torch.ops.higher_order.cond`, its predicate becomes a symbolic expression over the shape of the input,
@ -111,41 +111,41 @@ and branch functions become two sub-graph attributes of the top level graph module.
Here is another example that showcases how to express a data-dependent control flow:
```python
class DataDependentCondPredicate(torch.nn.Module):
    """
    A basic usage of cond based on data dependent predicate.
    """

    def __init__(self):
        super().__init__()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return torch.cond(x.sum() > 4.0, true_fn, false_fn, (x,))
```
The exported program we get after export:
```
class GraphModule(torch.nn.Module):
class GraphModule(torch.nn.Module):
    def forward(self, arg0_1: f32[s0, 3]):
        sum_1: f32[] = torch.ops.aten.sum.default(arg0_1)
        gt: b8[] = torch.ops.aten.gt.Scalar(sum_1, 4.0);  sum_1 = None
        true_graph_0 = self.true_graph_0
        false_graph_0 = self.false_graph_0
        conditional: f32[s0, 3] = torch.ops.higher_order.cond(gt, true_graph_0, false_graph_0, [arg0_1]);  gt = true_graph_0 = false_graph_0 = arg0_1 = None
        return (conditional,)

    class <lambda>(torch.nn.Module):
        def forward(self, arg0_1: f32[s0, 3]):
            cos: f32[s0, 3] = torch.ops.aten.cos.default(arg0_1)
            sin: f32[s0, 3] = torch.ops.aten.sin.default(arg0_1);  arg0_1 = None
            add: f32[s0, 3] = torch.ops.aten.add.Tensor(cos, sin);  cos = sin = None
            return add

    class <lambda>(torch.nn.Module):
        def forward(self, arg0_1: f32[s0, 3]):
            sin: f32[s0, 3] = torch.ops.aten.sin.default(arg0_1);  arg0_1 = None
            return sin
```
## Invariants of torch.ops.higher_order.cond

View File

@ -509,10 +509,6 @@ coverage_ignore_functions = [
"custom_fwd",
# torch.cuda.amp.common
"amp_definitely_not_available",
# torch.cuda.graphs
"graph_pool_handle",
"is_current_stream_capturing",
"make_graphed_callables",
# torch.mtia.memory
"reset_peak_memory_stats",
# torch.cuda.nccl
@ -524,25 +520,11 @@ coverage_ignore_functions = [
"reduce_scatter",
"unique_id",
"version",
# torch.cuda.nvtx
"range",
"range_end",
"range_start",
# torch.cuda.profiler
"init",
"profile",
"start",
"stop",
# torch.cuda.random
"get_rng_state",
"get_rng_state_all",
"initial_seed",
"manual_seed",
"manual_seed_all",
"seed",
"seed_all",
"set_rng_state",
"set_rng_state_all",
# torch.distributed.algorithms.ddp_comm_hooks.ddp_zero_hook
"hook_with_zero_step",
"hook_with_zero_step_interleaved",
@ -2172,8 +2154,6 @@ coverage_ignore_classes = [
"EventHandler",
"SynchronizationError",
"UnsynchronizedAccessError",
# torch.cuda.memory
"MemPool",
# torch.distributed.elastic.multiprocessing.errors
"ChildFailedError",
"ProcessFailure",
@ -2479,10 +2459,6 @@ coverage_ignore_classes = [
# torch.amp.grad_scaler
"GradScaler",
"OptState",
# torch.cuda.graphs
"CUDAGraph",
# torch.cuda.streams
"Event",
# torch.distributed.algorithms.ddp_comm_hooks.post_localSGD_hook
"PostLocalSGDState",
# torch.distributed.algorithms.ddp_comm_hooks.powerSGD_hook
@ -3176,8 +3152,6 @@ coverage_ignore_classes = [
"WeakIdKeyDictionary",
"WeakIdRef",
"WeakTensorKeyDictionary",
# torch.utils.debug_mode
"DebugMode",
]
# The suffix(es) of source filenames.

View File

@ -0,0 +1,47 @@
# Aliases in torch.cuda
The following are aliases to their counterparts in ``torch.cuda`` in the nested namespaces in which they are defined. For any of these APIs, feel free to use the top-level version in ``torch.cuda`` like ``torch.cuda.seed`` or the nested version ``torch.cuda.random.seed``.
```{eval-rst}
.. automodule:: torch.cuda.random
.. currentmodule:: torch.cuda.random
.. autosummary::
:toctree: generated
:nosignatures:
get_rng_state
get_rng_state_all
set_rng_state
set_rng_state_all
manual_seed
manual_seed_all
seed
seed_all
initial_seed
```
```{eval-rst}
.. automodule:: torch.cuda.graphs
.. currentmodule:: torch.cuda.graphs
.. autosummary::
:toctree: generated
:nosignatures:
is_current_stream_capturing
graph_pool_handle
CUDAGraph
graph
make_graphed_callables
```
```{eval-rst}
.. automodule:: torch.cuda.streams
.. currentmodule:: torch.cuda.streams
.. autosummary::
:toctree: generated
:nosignatures:
Stream
ExternalStream
Event
```

View File

@ -274,10 +274,6 @@ See the docs for {class}`~torch.cuda.gds.GdsFile` for an example of how to use t
.. py:module:: torch.cuda.gds
```
```{eval-rst}
.. py:module:: torch.cuda.graphs
```
```{eval-rst}
.. py:module:: torch.cuda.jiterator
```
@ -294,14 +290,13 @@ See the docs for {class}`~torch.cuda.gds.GdsFile` for an example of how to use t
.. py:module:: torch.cuda.profiler
```
```{eval-rst}
.. py:module:: torch.cuda.random
```
```{eval-rst}
.. py:module:: torch.cuda.sparse
```
```{eval-rst}
.. py:module:: torch.cuda.streams
```
.. toctree::
:hidden:
cuda.aliases.md
```

View File

@ -82,55 +82,48 @@ Some of the most commonly used backends include:
## Read More
```{eval-rst}
.. toctree::
:caption: Getting Started for PyTorch Users
:maxdepth: 1
```{toctree}
:caption: Getting Started for PyTorch Users
:maxdepth: 2
torch.compiler_get_started
torch.compiler_api
torch.compiler.config
torch.compiler_fine_grain_apis
torch.compiler_backward
torch.compiler_aot_inductor
torch.compiler_inductor_profiling
torch.compiler_profiling_torch_compile
torch.compiler_faq
torch.compiler_troubleshooting
torch.compiler_performance_dashboard
torch.compiler_inductor_provenance
torch.compiler_get_started
torch.compiler_api
torch.compiler.config
torch.compiler_dynamic_shapes
torch.compiler_fine_grain_apis
torch.compiler_backward
torch.compiler_aot_inductor
torch.compiler_inductor_profiling
torch.compiler_profiling_torch_compile
torch.compiler_faq
torch.compiler_troubleshooting
torch.compiler_performance_dashboard
torch.compiler_inductor_provenance
```
```{eval-rst}
.. toctree::
:caption: `torch.compile` Programming Model
```{toctree}
:caption: torch.compile Programming Model
:maxdepth: 2
compile/programming_model
compile/programming_model
```
% _If you want to contribute a developer-level topic
% that provides in-depth overview of a torch._dynamo feature,
% add in the below toc.
```{toctree}
:caption: Deep Dive for PyTorch Developers
:maxdepth: 1
```{eval-rst}
.. toctree::
:caption: Deep Dive for PyTorch Developers
:maxdepth: 1
torch.compiler_dynamo_overview
torch.compiler_dynamo_deepdive
torch.compiler_dynamic_shapes
torch.compiler_nn_module
torch.compiler_cudagraph_trees
torch.compiler_fake_tensor
torch.compiler_dynamo_overview
torch.compiler_dynamo_deepdive
torch.compiler_nn_module
torch.compiler_cudagraph_trees
torch.compiler_fake_tensor
```
```{eval-rst}
.. toctree::
:caption: HowTo for PyTorch Backend Vendors
:maxdepth: 1
```{toctree}
:caption: HowTo for PyTorch Backend Vendors
:maxdepth: 1
torch.compiler_custom_backends
torch.compiler_transformations
torch.compiler_ir
torch.compiler_custom_backends
torch.compiler_transformations
torch.compiler_ir
```

View File

@ -1,129 +1,295 @@
# Dynamic Shapes
---
file_format: mystnb
kernelspec:
name: python3
mystnb:
execution_timeout: 30
execution_show_tb: True
merge_streams: True
---
Code: [symbolic_shapes.py](https://github.com/pytorch/pytorch/blob/db4572dbf18f1cf50cf662547e272d3117063747/torch/fx/experimental/symbolic_shapes.py)
```{code-cell}
:tags: [remove-cell]
import torch
from compile import header_code
See also: [The dynamic shapes manual](https://docs.google.com/document/d/1GgvOe7C8_NVOMLOCwDaYV1mXXyHMXY7ExoewHqooxrs/edit#heading=h.fh8zzonyw8ng)
## Motivation
Deep learning compilers commonly only work for static shapes, that is to say, they produced compiled programs which only work for a single specific configuration of input shapes, and must recompile if any input shape changes. This assumption works great for the majority of commonly run deep learning models today, but there are a few situations where it is insufficient:
- Some dimensions, such as batch size or sequence length, may vary. For example, an inference service performing adaptive batching will execute inference requests with varying batch sizes depending on how many requests it received within its batching window. We may also want to consider padding out variable size sequences only to the maximum sequence length within a batch, which may vary from batch-to-batch.
- Some models exhibit data-dependent output shapes, that is to say, the size of their outputs and intermediates may depend on the actual input data which may vary across runs. For example, detection models may first generate a variable number of potential bounding boxes before running a more expensive image recognition model to identify if the subject is in a bounding box. The number of bounding boxes is data dependent.
- One particularly important case of data-dependent shapes occurs when dealing with sparse representations, such as sparse tensors, jagged tensors, and graph neural networks. In all of these cases, the amount of data to be processed depends on the sparse structure of the problem, which will typically vary in a data-dependent way.
In supporting dynamic shapes, we chose not to support dynamic rank programs, e.g., programs whose inputs tensors change in dimensionality, as this pattern rarely occurs in real-world deep learning programs, and it avoids the need to reason inductively over symbolic lists of shapes.
## Abridged public API
The default dynamic behavior in PyTorch 2.1 is:
- PT2 assumes everything is static by default
- If we recompile because a size changed, we will instead attempt to recompile
that size as being dynamic (sizes that have changed are likely to change in
the future). This generalization may fail (e.g., because user code does a
conditional branch on the size in question or missing dynamic shapes support
in PT2). If you are trying to understand why PT2 has overspecialized some
code, run with `TORCH_LOGS=dynamic` and look for "eval" entries that say
when guards are added and why.
- If you know ahead of time something will be dynamic, you can skip the first
recompile with `torch._dynamo.mark_dynamic(tensor, dim)`. If you know ahead of time
the `min` and `max` value this dimension can take, you can specify `torch._dynamo.mark_dynamic(tensor, dim, min=min, max=max)`
- If you say `torch.compile(dynamic=False)`, we will turn off automatic
dynamic shapes on recompiles and always recompile for each distinct size.
Conversely, if you say `torch.compile(dynamic=True)`, we will try to make
everything as dynamic as possible. This is mostly useful for small
operators; if you try it on a big model it will (1) probably crash PT2 and (2) run slow for no good reason.
- You can whitelist specific sources to be marked as dynamic using the
`TORCH_COMPILE_DYNAMIC_SOURCES` environment variable or by setting
`torch.compiler.config.dynamic_sources`. This is particularly useful for large
models with graph breaks, as you can maintain dynamism across graph breaks since
source names stay consistent. You can also use this to mark integers as dynamic.
The format is a comma-delimited list of source names, e.g., `"L['x'], L['y']"`.
You can also use regexes, e.g., `"L\['x.*'\], L\['y.*'\]"`.
This whitelist takes precedence over other flags like `dynamic=False`,
`force_nn_module_property_static_shapes`, and `force_parameter_static_shapes`.
- Sometimes it can be cumbersome to find the right inputs to mark as dynamic. If
you're willing to take a performance hit for the first batch, one other affordable
option we have are the eager_then_compile stances which derive dynamism for you.
See [torch.compiler.set_stance](https://docs.pytorch.org/docs/stable/generated/torch.compiler.set_stance.html) for more details.
## The Guard Model
When considering how to add support for dynamic shapes to TorchDynamo and TorchInductor, we made a major design decision: in order to reuse decompositions and other preexisting code written in Python/C++ targeting the PyTorch API, we must be able to trace through dynamic shapes. Unlike a fully symbolic system which might capture both branches of a conditional, we always pick one branch and specialize our trace under the assumption that we only use this trace when we would have made the same choice for that branch in the future. To do this, we maintain a "hint" for every symbolic size saying what its concrete value is at compile time (as TorchDynamo is a just-in-time compiler, it always knows what the actual input sizes are.) When we perform a condition on a tensor, we simply consult the hint to find out which branch to take.
This greatly simplifies the symbolic shape formulas we produce, but means we have a much more involved system for managing guards. Consider, for example, the following program:
```python
def f(x, y):
z = torch.cat([x, y])
if z.size(0) > 2:
return z.mul(2)
else:
return z.add(2)
torch._logging.set_logs(graph_breaks=True, graph_code=True)
```
The final IR we will compile with TorchInductor will either be `torch.cat([x, y]).add(2)` or `torch.cat([x, y]).mul(2)` (with the condition flattened away), but to determine which branch we are in, we would need to know the size of `z`, an intermediate. Because TorchDynamo must know upfront if a compiled trace is valid (we do not support bailouts, like some JIT compilers), we must be able to reduce `z.size(0)` as an expression in terms of the inputs, `x.size(0) + y.size(0)`. This is done by writing meta functions for all operators in PyTorch which can propagate size information to the output of a tensor without actually performing computation on the node.
(dynamic_shapes)=
# Dynamic Shapes
## Overall architecture
This section explains how to work with dynamic shapes in PyTorch, including how
to debug and fix common errors, implement support for dynamic shapes in
operators, and understand the underlying mechanisms.
Symbolic shapes workflow:
Dynamic shapes allow PyTorch models to handle inputs with varying dimensions
without recompilation. This enables more flexible models that can process
different batch sizes, sequence lengths, or image dimensions in a single
compiled artifact. Dynamic shapes work by symbolically tracing tensor
dimensions rather than using concrete values, creating a computation
graph that adapts to different input shapes at runtime. By default,
PyTorch assumes all input shapes to be static.
1. When we start compiling a frame in Dynamo, we allocate a ShapeEnv (attached to FakeTensorMode) which keeps track of symbolic shapes state.
2. We allocate symbolic sizes for tensors on entry (what is static or dynamic is a policy decision, with some knobs).
3. We propagate the symbolic sizes through operators, maintaining both (1) FX IR so that we can faithfully export symbolic compute, and (2) Sympy expressions representing the size vars, so we can reason about them.
4. When we condition on symbolic sizes, either in Dynamo tracing or in Inductor optimization, we add guards based on the conditional. These can be induced from both Python and C++.
5. These guards can induce further simplifications on symbolic variables. For example, if you assert `s0 == 4`, we can now replace all occurrences of `s0` with `4`.
6. When we're done tracing and optimizing, we install all of these guards with the compiled code; the compiled code is only reusable if all the guards evaluate true.
Typically, deep learning compilers only support static shapes, requiring
recompilation for input shape changes. While this approach covers many use cases,
there are situations where this is insufficient:
Important files:
- **Variable Dimensions** - Batch sizes or sequence lengths vary, such as in
adaptive batching.
- **Data-Dependent Outputs** - Models produce outputs based on input data,
like variable bounding boxes in detection models.
- **Sparse Representations** - Processing depends on data-varying sparse structures,
such as in sparse tensors, jagged tensors, and graph neural networks.
- C++ SymInt API: `c10/core/SymInt.h`, `SymFloat.h`, `SymBool.h`
- Python SymInt API: `torch/__init__.py` (look for `SymInt/SymFloat/SymBool`)
- C++ plumbing: `c10/core/SymNodeImpl.h`, `torch/csrc/utils/python_symnode.h`, `torch/csrc/jit/python/init.cpp`
- Python infrastructure: `torch/fx/experimental/symbolic_shapes.py`
- Other important files: `torch/_subclasses/fake_tensor.py`, `torch/_meta_registrations.py`, decomps, PrimTorch refs
Dynamic shapes do not support dynamic rank programs, that is, programs whose input tensors
change in dimensionality, as this pattern is uncommon and would add unnecessary complexity.
## Abridged internal API
Understanding the Python class hierarchy:
## What does it mean for a size/integer to be dynamic?
- SymInt/SymFloat/SymBool: these are user-visible classes that simulate their int/float/bool counterparts. If you add two SymInts, we give you a new SymInt that symbolically tracks that the integer addition had occurred.
- SymNode: this is the internal structure (accessible via e.g., `symint.node`) which holds the actual symbolic tracking info. SymNode is type erased; this makes it more convenient to represent mixed-type operations. Note that technically you don't have to call into Python SymNode from SymInt; for example, XLA's C++ `SymNodeImpl` would take the place of SymNode.
- ShapeEnv: per-compile context state which keeps track of all the free symbols and guards we have accumulated so far. Every SymNode records its ShapeEnv (but not vice versa; SymNodes only get used if they participate in a guard).
Dynamic shapes allow avoiding recompilations by making certain dimensions or integers
dynamic. For example, if a function `f(x)` is compiled with a static size, it will need
recompilation for different sizes:
C++ is fairly similar:
```{note}
For simplicity, this example uses `@torch.compile(dynamic=True)`. Note that
this option is not recommended because it is error prone.
For a recommended way of enabling dynamic shapes, see {ref}`enable-dynamic-behavior`.
```
- c10::SymInt/SymFloat/SymBool: user-visible classes that simulate int/float/bool.
- c10::SymNode/SymNodeImpl: analogous to SymNode
- There is no ShapeEnv in C++; for ease of debugging, the entire symbolic reasoning apparatus is in Python.
When you write code that is traceable with `make_fx`, it must be able to deal with SymInt/SymFloat/SymBool flowing through it. [The dynamic shapes manual](https://docs.google.com/document/d/1GgvOe7C8_NVOMLOCwDaYV1mXXyHMXY7ExoewHqooxrs/edit#heading=h.fh8zzonyw8ng) gives some guidance for how to do this.
```{code-cell}
import torch

@torch.compile(dynamic=False)
def f(x):
    return x * x.size()[0]

f(torch.rand(10))
f(torch.rand(20))
f(torch.rand(30))
f(torch.rand(40))
```
## DimDynamic policy
In the produced output, you can see that four graphs were generated.
See the corresponding <a href="_static/img/dynamic_shapes/tlparse1_dynamic_shapes_false.png" target="_blank">tlparse output</a>
Symbolic reasoning:
By making the size dynamic, the function can handle various sizes without recompilation:
- Value ranges
- Sympy usage notes
- Constraints
- DimDynamic/Constraint
```{code-cell}
import torch

@torch.compile(dynamic=True)
def f(x):
    return x * x.size()[0]

f(torch.rand(10))
f(torch.rand(20))
f(torch.rand(30))
f(torch.rand(40))
```
With dynamic shapes enabled, only one graph is created. See the
corresponding <a href="_static/img/dynamic_shapes/tlparse2_dynamic_shapes_true.png" target="_blank">tlparse output</a>.
## Unbacked SymInts
To resolve control flow, we check the hint, aka actual value, of a symbolic integer to determine which branch to take. However, in some cases, we may not have a hint: so-called unbacked symbolic integers arise when a size variable emerges from a data-dependent operation like `.nonzero()` or `.item()`. It is illegal to perform control flow on these symbolic integers, so we must graph break on these operations.
Naively implemented, this is too restrictive: most PyTorch programs will immediately fail if you try to do anything with unbacked symbolic integers. Here are the most important enhancements to make this actually work:
While compilation time differences
are minimal for this small example, more complex use cases would show significant
performance improvements.
- On tensor creation, PyTorch precomputes a lot of data about a tensor; for example, if you use `empty_strided` to create a tensor, we will eagerly sort the strides and determine if the tensor is non-overlapping and dense. Sorts produce a lot of guards. However, it is more common to produce a tensor directly with a higher-level API like `empty`, which is guaranteed to produce a non-overlapping and dense tensor. We modified PyTorch to avoid needlessly recomputing these properties.
- Even if nontrivial compute is needed, sometimes a property is never actually queried at all. Making these precomputed properties lazy allows us to avoid guarding on an unbacked symbolic integer unless it is actually needed.
- The data in an integer tensor is generally not known to be non-negative. However, we provide an API `constrain_range` whereby a user can specify that a size is bounded above and below by known limits.
(what_is_a_specialization)=
## What is a specialization?
Similar to the dynamic APIs, there are corresponding unbacked APIs: namely you can use mark_unbacked instead of `mark_dynamic` and `TORCH_COMPILE_UNBACKED_SOURCES` instead of `TORCH_COMPILE_DYNAMIC_SOURCES` to tell the compiler to mark an input as unbacked.
**Specialization** refers to optimizing a computational graph for specific input shapes
by examining shape conditions during control flow. If a branch is taken based on a
shape condition, the graph is tailored for that condition. If a new input doesn't meet
this condition, the system will recompile the graph.
In future versions of PT2 (beyond PT2.1), we will extend our reasoning system
to infer that an unbacked symbolic integer is size-like based on usage. For
example, if you pass the result of an `.item()` call to a factory function
like `torch.empty`, we will automatically infer that the result is a size
(because if it was not, it would fail.) This assumption would get validated
at runtime, raising an error if it was not fulfilled.
Specialization allows you to create optimized computational graphs for specific input
shapes, which can significantly improve execution speed.
```{code-cell}
import torch
@torch.compile(dynamic=True)
def f(x):
if x.size()[0] == 10:
return x * 10
if x.size()[0] <= 30:
return x*200
return x*x.size()[0]
f(torch.rand(10))
f(torch.rand(20))
f(torch.rand(30))
f(torch.rand(40))
f(torch.rand(50))
```
In the code above, the graph is specialized for an input size of exactly 10, in which
case it returns `x * 10`. For input sizes of at most 30, it returns `x * 200`; otherwise it returns `x * x.size()[0]`.
In the output, you can see that this creates three graphs.
See the corresponding <a href="_static/img/dynamic_shapes/tlparse3_specialization.png" target="_blank">tlparse output</a>
This is how the graphs are created for the above function:
```{image} _static/img/dynamic_shapes/dynamic_shapes_example_specialization.png
```
(enable-dynamic-behavior)=
## Enabling Dynamic Behavior
There are the following ways to make things dynamic:
* {ref}`automatic_dynamic`
* {ref}`user_annotations` (preferred)
* {ref}`torch_compile_dynamic_true` (for testing only)
* {ref}`dynamic_shapes_advanced_control_options` (for advanced use cases)
Each of these options is described below.
(automatic_dynamic)=
### Automatic dynamic
**Automatic dynamic** is the default behavior where {func}`torch.compile` performs
the initial compilation assuming static shapes are used, while tracking the
input sizes from that first compilation. When a recompile is triggered, it
uses this information to identify which dimensions have changed and marks
those as dynamic for the second compilation.
(user_annotations)=
### User Annotations
Several APIs allow users to explicitly mark specific inputs
by name or code as dynamic. This is useful for avoiding initial compilations that
would eventually become dynamic with the previous tools. It is also used to mark
elements that do not automatically get marked as dynamic, such as neural network
module parameters, and so on. User annotations are the preferred way to enable
dynamic shapes.
#### `mark_dynamic(tensor, dim, min=min, max=max)`
The {func}`torch._dynamo.mark_dynamic` function marks a tensor dimension as dynamic and will fail if it
gets specialized. It does not work for integers. Use this function only if you know
all graphs in the frame using this input converge to a single dynamic graph.
Otherwise, you may encounter a misleading constraint violation error.
In such cases, consider using {func}`torch._dynamo.maybe_mark_dynamic`. Currently,
{func}`torch._dynamo.mark_dynamic`
does not take precedence over `force_parameter_static_shapes = True` or `force_nn_module_property_static_shapes = True`.
If you know in advance that a particular dimension will be dynamic, you
can avoid the initial recompilation by using {func}`torch._dynamo.mark_dynamic(tensor, dim)`.
Additionally, if you already know the minimum and maximum possible
values for this dimension, you can specify them with
{func}`torch._dynamo.mark_dynamic(tensor, dim, min=min, max=max)`.
Here is a quick example:
```{code-cell}
import torch

@torch.compile(dynamic=True)
def f(x):
    return x * x.size()[0]

x = torch.randn(10)
torch._dynamo.mark_dynamic(x, 0)
# the first invocation is given a tensor marked as dynamic
f(x)
# the remaining invocations reuse the dynamically compiled code
f(torch.randn(20))
f(torch.randn(30))
f(torch.randn(40))
```
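If the bounds are known in advance, the same example can pass them explicitly. This is a minimal sketch of the `min`/`max` form described above; the bound values here are arbitrary:

```python
import torch

@torch.compile
def f(x):
    return x * x.size()[0]

x = torch.randn(16)
# Mark dimension 0 as dynamic and declare that it always stays within [2, 1024].
torch._dynamo.mark_dynamic(x, 0, min=2, max=1024)
f(x)
f(torch.randn(64))   # reuses the dynamically compiled code
f(torch.randn(512))  # still within the declared bounds
```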
#### `maybe_mark_dynamic(tensor, dim)`
The {func}`torch._dynamo.maybe_mark_dynamic` function shares all properties
with {func}`torch._dynamo.mark_dynamic`
but does not fail if the size gets specialized. Use it for inputs shared by
multiple graphs or if the number of graphs does not converge to one for a specific
frame. For instance, in the example above, prefer {func}`torch._dynamo.maybe_mark_dynamic` because graphs
with sizes 0 and 1 will specialize. Use {func}`torch._dynamo.mark_dynamic` instead if you want a
guarantee that the dimension never specializes (it will raise an error rather than specialize).
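A minimal sketch of the same pattern with {func}`torch._dynamo.maybe_mark_dynamic`; the only difference from the previous example is that a later specialization of this dimension would not raise an error:

```python
import torch

@torch.compile
def f(x):
    return x * x.size()[0]

x = torch.randn(10)
# A hint rather than a hard requirement: if dimension 0 ends up specialized
# (for example, for sizes 0 or 1), compilation proceeds without an error.
torch._dynamo.maybe_mark_dynamic(x, 0)
f(x)
f(torch.randn(20))
f(torch.randn(30))
```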
#### `mark_unbacked(tensor, dim)`
The {func}`torch._dynamo.mark_unbacked` function marks a tensor dimension as unbacked. It is unlikely
to be the tool you need, but it can be useful when a specialization happens inside a
`guard_size_oblivious(...)` condition and marking the dimension unbacked removes it.
Make sure it actually removes the specialization and does not instead introduce a
data-dependent error (which turns into a graph break) at or before the location
you are trying to fix. The next option is often a better choice.
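For completeness, here is a minimal sketch of {func}`torch._dynamo.mark_unbacked` on a function that never branches on the marked size (if it did branch, you would get a data-dependent error rather than a specialization):

```python
import torch

@torch.compile
def f(x):
    return x * 2

x = torch.randn(10)
# Dimension 0 becomes an unbacked symbol: the compiler never sees its concrete
# value, so it cannot specialize on it.
torch._dynamo.mark_unbacked(x, 0)
f(x)
f(torch.randn(20))  # other sizes reuse the same graph
```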
(dynamic_sources_allow_list)=
#### Dynamic Allow List (`DYNAMIC_SOURCES`)
Use the environment variable `TORCH_COMPILE_DYNAMIC_SOURCES` to pass a comma-separated
list of source names to be marked as dynamic. For example:
`TORCH_COMPILE_DYNAMIC_SOURCES=L[x],L[y]`
It's easiest to find these dynamic source names using the PGO artifact in `tlparse`;
you can copy and paste them from there. This method works
for integers and tensor sizes and takes precedence over all other flags
that force static shapes. It will not throw an error if what is marked dynamic
gets specialized or if the provided source does not exist.
Here is an example:
```{code-cell}
import torch

@torch.compile()
def f(x):
    return x * x.size()[0]

with torch.compiler.config.patch(dynamic_sources="L['x']"):
    f(torch.rand(10))
    f(torch.rand(20))
    f(torch.rand(30))
    f(torch.rand(40))
```
(torch.compiler.set_stance_eager_then_compile)=
#### `torch.compiler.set_stance("eager_then_compile")`
At times, identifying the appropriate inputs to mark as dynamic can
be challenging. If you are willing to accept a performance cost for
the first batch, another convenient option is to use the
`eager_then_compile` stance, which automatically determines dynamic
inputs for you. For more information, see {func}`torch.compiler.set_stance` and [Dynamic Compilation Control with torch.compiler.set_stance](https://docs.pytorch.org/tutorials/recipes/torch_compiler_set_stance_tutorial.html).
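Here is a minimal sketch of this stance; the exact compilation points follow the behavior described in the linked tutorial (first call eager, later calls compiled with the changed dimensions dynamic):

```python
import torch

@torch.compile
def f(x):
    return x * x.size()[0]

torch.compiler.set_stance("eager_then_compile")
f(torch.randn(10))  # runs eagerly; input shapes are recorded
f(torch.randn(20))  # compiled, with the varying dimension treated as dynamic
f(torch.randn(30))  # reuses the compiled dynamic graph
```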
(torch_compile_dynamic_true)=
### `torch.compile(dynamic=True)` (Not recommended)
This setting forces all sizes and integers to be dynamic, which increases the
chance of encountering dynamic-shape bugs. It is not recommended because it is
error prone: making every input size dynamic can cause performance
regressions and ultimately increase compilation time.
PyTorch also provides advanced control options for dynamic shapes, see:
{ref}`dynamic_shapes_advanced_control_options`.
## Where Do I Go From Here?
If you encounter a framework code bug or an issue with specialization,
file an issue so it can be reviewed and potentially improved. If the issue
is within your user code, consider whether you are willing to rewrite your
code to avoid it. Determine if it affects correctness or if it's a redundant
check. If the issue involves a Triton custom kernel with a `constexpr`
argument, evaluate whether you can rewrite it to address the problem.
```{toctree}
:maxdepth: 1
compile/dynamic_shapes_core_concepts
compile/dynamic_shapes_troubleshooting
compile/dynamic_shapes_advanced_control_options
compile/dynamic_shapes_beyond_the_basics
```
```{seealso}
* [tlparse documentation](https://github.com/pytorch/tlparse)
* [The dynamic shapes manual](https://docs.google.com/document/d/1GgvOe7C8_NVOMLOCwDaYV1mXXyHMXY7ExoewHqooxrs/edit?tab=t.0#heading=h.fh8zzonyw8ng)
```


@@ -78,7 +78,6 @@ for tracking purposes -->
.. py:module:: torch.utils.data.graph
.. py:module:: torch.utils.data.graph_settings
.. py:module:: torch.utils.data.sampler
.. py:module:: torch.utils.debug_mode
.. py:module:: torch.utils.dlpack
.. py:module:: torch.utils.file_baton
.. py:module:: torch.utils.flop_counter
