Compare commits


6 Commits

Author SHA1 Message Date
a8c367127b Revert all changes to torch/cuda/_utils.py 2025-09-17 13:16:34 -07:00
fa839e440c testy test 2025-09-17 13:15:06 -07:00
4ae58a3dd4 simplify nvrtc discovery logic 2025-09-17 13:15:06 -07:00
8e8ec24374 Update _utils.py 2025-09-17 13:15:05 -07:00
4b74106204 lint 2025-09-17 13:15:05 -07:00
693880081c cub and compile_kernel 2025-09-17 13:15:04 -07:00
219 changed files with 2671 additions and 4791 deletions

View File

@ -31,7 +31,8 @@ pip install -r /pytorch/requirements.txt
pip install auditwheel==6.2.0 wheel
if [ "$DESIRED_CUDA" = "cpu" ]; then
echo "BASE_CUDA_VERSION is not set. Building cpu wheel."
python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn
#USE_PRIORITIZED_TEXT_FOR_LD for enable linker script optimization https://github.com/pytorch/pytorch/pull/121975/files
USE_PRIORITIZED_TEXT_FOR_LD=1 python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn
else
echo "BASE_CUDA_VERSION is set to: $DESIRED_CUDA"
export USE_SYSTEM_NCCL=1
@ -45,5 +46,6 @@ else
export USE_NVIDIA_PYPI_LIBS=1
fi
python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn --enable-cuda
#USE_PRIORITIZED_TEXT_FOR_LD for enable linker script optimization https://github.com/pytorch/pytorch/pull/121975/files
USE_PRIORITIZED_TEXT_FOR_LD=1 python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn --enable-cuda
fi

View File

@ -317,7 +317,7 @@ if __name__ == "__main__":
).decode()
print("Building PyTorch wheel")
build_vars = ""
build_vars = "CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000 "
# MAX_JOB=5 is not required for CPU backend (see commit 465d98b)
if enable_cuda:
build_vars += "MAX_JOBS=5 "

View File

@ -1 +1 @@
e0dda9059d082537cee36be6c5e4fe3b18c880c0
56392aa978594cc155fa8af48cd949f5b5f1823a

View File

@ -1,2 +1,2 @@
transformers==4.56.0
transformers==4.54.0
soxr==0.5.0

View File

@ -42,27 +42,22 @@ install_pip_dependencies() {
# A workaround, ExecuTorch has moved to numpy 2.0 which is not compatible with the current
# numba and scipy version used in PyTorch CI
conda_run pip uninstall -y numba scipy
# Yaspin is needed for running CI test (get_benchmark_analysis_data.py)
pip_install yaspin==3.1.0
popd
}
setup_executorch() {
pushd executorch
export PYTHON_EXECUTABLE=python
export CMAKE_ARGS="-DEXECUTORCH_BUILD_PYBIND=ON -DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON -DEXECUTORCH_BUILD_TESTS=ON"
export CMAKE_ARGS="-DEXECUTORCH_BUILD_PYBIND=ON -DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON"
as_jenkins .ci/scripts/setup-linux.sh --build-tool cmake || true
popd
}
if [ $# -eq 0 ]; then
clone_executorch
install_buck2
install_conda_dependencies
install_pip_dependencies
pushd executorch
setup_executorch
popd
else
"$@"
fi
clone_executorch
install_buck2
install_conda_dependencies
install_pip_dependencies
setup_executorch

View File

@ -1,24 +1,23 @@
sphinx==6.2.1
sphinx==5.3.0
#Description: This is used to generate PyTorch docs
#Pinned versions: 7.2.6
#Pinned versions: 5.3.0
-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@1657ad2fc1acdc98aa719eebecbb0128a7c13ce4#egg=pytorch_sphinx_theme2
sphinx-remove-toctrees==1.0.0.post1
#Description: This is used to generate PyTorch docs
#Pinned versions: 1.0.0.post1
# TODO: sphinxcontrib.katex 0.9.0 adds a local KaTeX server to speed up pre-rendering
# but it doesn't seem to work and hangs around idly. The initial thought that it is probably
# something related to Docker setup. We can investigate this later.
sphinxcontrib.katex==0.9.10
sphinxcontrib.katex==0.8.6
#Description: This is used to generate PyTorch docs
#Pinned versions: 0.9.10
#Pinned versions: 0.8.6
sphinx_sitemap==2.7.1
sphinxext-opengraph==0.9.1
#Description: This is used to generate PyTorch docs
#Pinned versions: 0.9.1
sphinx_sitemap==2.6.0
#Description: This is used to generate sitemap for PyTorch docs
#Pinned versions: 2.7.1
#Pinned versions: 2.6.0
matplotlib==3.5.3 ; python_version < "3.13"
matplotlib==3.6.3 ; python_version >= "3.13"
@ -30,17 +29,17 @@ tensorboard==2.18.0 ; python_version >= "3.13"
#Description: This is used to generate PyTorch docs
#Pinned versions: 2.13.0
breathe==4.35.0
breathe==4.34.0
#Description: This is used to generate PyTorch C++ docs
#Pinned versions: 4.35.0
#Pinned versions: 4.34.0
exhale==0.3.7
exhale==0.2.3
#Description: This is used to generate PyTorch C++ docs
#Pinned versions: 0.3.7
#Pinned versions: 0.2.3
docutils==0.18.1
docutils==0.16
#Description: This is used to generate PyTorch C++ docs
#Pinned versions: 0.18.1
#Pinned versions: 0.16
bs4==0.0.1
#Description: This is used to generate PyTorch C++ docs
@ -50,24 +49,13 @@ IPython==8.12.0
#Description: This is used to generate PyTorch functorch docs
#Pinned versions: 8.12.0
<<<<<<< HEAD
myst-nb==0.17.2
#Description: This is used to generate PyTorch functorch and torch.compile docs.
#Pinned versions: 0.17.2
=======
myst-nb==1.2.0
#Description: This is used to generate PyTorch functorch docs
#Pinned versions: 1.2.0
>>>>>>> 195382ce28e (Update)
# The following are required to build torch.distributed.elastic.rendezvous.etcd* docs
python-etcd==0.4.5
sphinx-copybutton==0.5.2
sphinx-design==0.6.1
sphinx-copybutton==0.5.0
sphinx-design==0.4.0
sphinxcontrib-mermaid==1.0.0
<<<<<<< HEAD
myst-parser==0.18.1
=======
myst-parser==3.0.1
myst-nb
>>>>>>> 195382ce28e (Update)

View File

@ -83,10 +83,6 @@ rm -rf pytorch || true
pushd "$pt_checkout"
pushd docs
# Profile the docs build to see what is taking the longest
python -m cProfile -o docs_build.prof -m sphinx.cmd.build -b html -d build/doctrees source build/html
python -c "import pstats; p = pstats.Stats('docs_build.prof'); p.sort_stats('cumtime').print_stats(50)"
# Build the docs
if [ "$is_main_doc" = true ]; then
build_docs html || exit $?

View File

@ -1550,10 +1550,14 @@ test_executorch() {
install_torchvision
install_torchaudio
INSTALL_SCRIPT="$(pwd)/.ci/docker/common/install_executorch.sh"
pushd /executorch
"${INSTALL_SCRIPT}" setup_executorch
export PYTHON_EXECUTABLE=python
export CMAKE_ARGS="-DEXECUTORCH_BUILD_PYBIND=ON -DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON"
# NB: We need to rebuild ExecuTorch runner here because it depends on PyTorch
# from the PR
bash .ci/scripts/setup-linux.sh --build-tool cmake
echo "Run ExecuTorch unit tests"
pytest -v -n auto
@ -1567,6 +1571,10 @@ test_executorch() {
popd
# Test torchgen generated code for Executorch.
echo "Testing ExecuTorch op registration"
"$BUILD_BIN_DIR"/test_edge_op_registration
assert_git_not_dirty
}

View File

@ -264,7 +264,7 @@ def unzip_artifact_and_replace_files() -> None:
change_content_to_new_version(f"artifacts/dist/{old_stem}/torch/version.py")
for file in Path(f"artifacts/dist/{old_stem}").glob(
"*.dist-info/*",
"*.dist-info/**",
):
change_content_to_new_version(file)
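
For illustration, a minimal pathlib sketch of how the two glob patterns above differ, using a hypothetical wheel stem (the exact matching of a trailing ** also varies across Python versions):

    from pathlib import Path

    # Hypothetical layout mirroring artifacts/dist/<old_stem>/ in the script above.
    root = Path("artifacts/dist/example_stem")

    # Non-recursive: only entries directly inside each *.dist-info directory.
    shallow = list(root.glob("*.dist-info/*"))

    # Recursive: walks each *.dist-info tree. A trailing "**" matches
    # directories (and, on newer Python versions, files as well), so
    # "*.dist-info/**/*" is the usual spelling when every nested file is wanted.
    deep = list(root.glob("*.dist-info/**"))

    print(len(shallow), len(deep))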

View File

@ -1 +1 @@
9d1c50a5ac8726f4af0d4a4e85ad4d26a674ad26
d119fc86140785e7efc8f125c17153544d1e0f20

.github/labeler.yml vendored
View File

@ -130,6 +130,3 @@
- torch/csrc/inductor/aoti_include/**
- torchgen/aoti/**
- torchgen/gen_aoti_c_shim.py
"ciflow/vllm":
- .github/ci_commit_pins/vllm.txt

View File

@ -74,11 +74,7 @@ jobs:
- docs_type: python
runner: ${{ inputs.runner_prefix }}linux.2xlarge
# It takes less than 30m to finish python docs unless there are issues
timeout-minutes: 60
- docs_type: functorch
runner: ${{ inputs.runner_prefix }}linux.2xlarge
# It takes less than 15m to finish functorch docs unless there are issues
timeout-minutes: 15
timeout-minutes: 30
# Set a fixed name for this job instead of using the current matrix-generated name, i.e. build-docs (cpp, linux.12xlarge, 180)
# The current name requires updating the database last docs push query from test-infra every time the matrix is updated
name: build-docs-${{ matrix.docs_type }}-${{ inputs.push }}

View File

@ -71,7 +71,8 @@ jobs:
pytorch-linux-jammy-py3-clang12-onnx,
pytorch-linux-jammy-linter,
pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-linter,
pytorch-linux-jammy-py3-clang12-executorch,
# Executorch pin needs update
# pytorch-linux-jammy-py3-clang12-executorch,
pytorch-linux-jammy-py3.12-triton-cpu,
pytorch-linux-noble-riscv64-py3.12-gcc14
]

View File

@ -318,6 +318,32 @@ jobs:
]}
secrets: inherit
linux-jammy-py3-clang12-executorch-build:
if: false # Docker build needs pin update
name: linux-jammy-py3-clang12-executorch
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3-clang12-executorch
docker-image-name: ci-image:pytorch-linux-jammy-py3-clang12-executorch
test-matrix: |
{ include: [
{ config: "executorch", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
]}
secrets: inherit
linux-jammy-py3-clang12-executorch-test:
name: linux-jammy-py3-clang12-executorch
uses: ./.github/workflows/_linux-test.yml
needs: linux-jammy-py3-clang12-executorch-build
if: false # Has been broken for a while
with:
build-environment: linux-jammy-py3-clang12-executorch
docker-image: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-cuda12_8-py3_10-gcc9-inductor-build:
name: cuda12.8-py3.10-gcc9-sm75
uses: ./.github/workflows/_linux-build.yml

View File

@ -259,27 +259,3 @@ jobs:
docker-image: ${{ needs.verify-cachebench-cpu-build.outputs.docker-image }}
test-matrix: ${{ needs.verify-cachebench-cpu-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-py3-clang12-executorch-build:
name: linux-jammy-py3-clang12-executorch
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3-clang12-executorch
docker-image-name: ci-image:pytorch-linux-jammy-py3-clang12-executorch
test-matrix: |
{ include: [
{ config: "executorch", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
]}
secrets: inherit
linux-jammy-py3-clang12-executorch-test:
name: linux-jammy-py3-clang12-executorch
uses: ./.github/workflows/_linux-test.yml
needs: linux-jammy-py3-clang12-executorch-build
with:
build-environment: linux-jammy-py3-clang12-executorch
docker-image: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.test-matrix }}
secrets: inherit

View File

@ -36,8 +36,6 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
# When building vLLM, uv doesn't like that we rename wheel without changing the wheel metadata
allow-reuse-old-whl: false
build-additional-packages: "vision audio"
build-external-packages: "vllm"
build-environment: linux-jammy-cuda12.8-py3.12-gcc11

.gitignore vendored
View File

@ -259,9 +259,6 @@ gen
.pytest_cache
aten/build/*
# Linker scripts for prioritized text optimization
cmake/linker_script.ld
# Bram
plsdontbreak

View File

@ -964,6 +964,7 @@ exclude_patterns = [
'test/jit/**', # should be run through test/test_jit.py
'test/ao/sparsity/**', # should be run through test/test_ao_sparsity.py
'test/fx/**', # should be run through test/test_fx.py
'test/bottleneck_test/**', # excluded by test/run_test.py
'test/package/**', # excluded by test/run_test.py
'test/distributed/argparse_util_test.py',
'test/distributed/bin/test_script.py',
@ -1409,6 +1410,8 @@ exclude_patterns = [
'torch/utils/benchmark/utils/timer.py',
'torch/utils/benchmark/utils/valgrind_wrapper/__init__.py',
'torch/utils/benchmark/utils/valgrind_wrapper/timer_interface.py',
'torch/utils/bottleneck/__init__.py',
'torch/utils/bottleneck/__main__.py',
'torch/utils/bundled_inputs.py',
'torch/utils/checkpoint.py',
'torch/utils/collect_env.py',

View File

@ -380,13 +380,6 @@ cmake_dependent_option(BUILD_BUNDLE_PTXAS "Bundle PTX into torch/bin fodler"
OFF "USE_CUDA" OFF)
cmake_dependent_option(USE_KLEIDIAI "Use KleidiAI for the ARM CPU & AARCH64 architecture." ON
"CPU_AARCH64" OFF)
# prioritized text linker, ON by default for AArch64+Linux, option visible to all AArch64, x86 and ppc64le.
set(USE_PRIORITIZED_TEXT_DEFAULT OFF)
if(LINUX AND CPU_AARCH64)
set(USE_PRIORITIZED_TEXT_DEFAULT ON)
endif()
cmake_dependent_option(USE_PRIORITIZED_TEXT_FOR_LD "Use prioritized text linker for ld."
"${USE_PRIORITIZED_TEXT_DEFAULT}" "CPU_INTEL OR CPU_AARCH64 OR CPU_POWER" OFF)
option(USE_MIMALLOC "Use mimalloc" OFF)
# Enable third party mimalloc library to improve memory allocation performance
@ -664,11 +657,6 @@ endif(MSVC)
string(APPEND CMAKE_CUDA_FLAGS " -Xfatbin -compress-all")
# Set linker max-page-size to 64KiB on AArch64 Linux
if(LINUX AND CPU_AARCH64)
add_link_options_if_supported("-z,max-page-size=0x10000")
endif()
# Set INTERN_BUILD_MOBILE for all mobile builds. Components that are not
# applicable to mobile are disabled by this variable. Setting
# `BUILD_PYTORCH_MOBILE_WITH_HOST_TOOLCHAIN` environment variable can force it
@ -1433,57 +1421,3 @@ if(BUILD_BUNDLE_PTXAS AND USE_CUDA)
install(PROGRAMS "${PROJECT_BINARY_DIR}/ptxas"
DESTINATION "${CMAKE_INSTALL_BINDIR}")
endif()
if(USE_PRIORITIZED_TEXT_FOR_LD)
add_compile_options(
$<$<COMPILE_LANGUAGE:C,CXX>:-ffunction-sections>
$<$<COMPILE_LANGUAGE:C,CXX>:-fdata-sections>
)
set(LINKER_SCRIPT_FILE_OUT "${CMAKE_SOURCE_DIR}/cmake/linker_script.ld")
set(LINKER_SCRIPT_FILE_IN "${CMAKE_SOURCE_DIR}/cmake/prioritized_text.txt")
add_custom_command(
OUTPUT "${LINKER_SCRIPT_FILE_OUT}"
COMMAND ${Python_EXECUTABLE} ${CMAKE_SOURCE_DIR}/tools/setup_helpers/generate_linker_script.py --filein "${LINKER_SCRIPT_FILE_IN}" --fout "${LINKER_SCRIPT_FILE_OUT}"
DEPENDS ${CMAKE_SOURCE_DIR}/tools/setup_helpers/generate_linker_script.py "${LINKER_SCRIPT_FILE_IN}"
COMMENT "Generating prioritized text linker files"
VERBATIM
)
add_custom_target(generate_linker_script DEPENDS "${LINKER_SCRIPT_FILE_OUT}")
if(BUILD_PYTHON)
set(LINKER_OPT_TARGETS torch_python)
endif()
if(NOT BUILD_LIBTORCHLESS)
list(APPEND LINKER_OPT_TARGETS torch_cpu c10)
if(USE_CUDA)
list(APPEND LINKER_OPT_TARGETS torch_cuda c10_cuda)
endif()
if(USE_XPU)
list(APPEND LINKER_OPT_TARGETS torch_xpu c10_xpu)
endif()
if(USE_ROCM)
list(APPEND LINKER_OPT_TARGETS torch_hip c10_hip)
endif()
endif()
foreach(tgt IN LISTS LINKER_OPT_TARGETS)
if(TARGET ${tgt})
add_dependencies("${tgt}" generate_linker_script)
target_link_options_if_supported(${tgt} "-T,${LINKER_SCRIPT_FILE_OUT}")
set_property(TARGET ${tgt} APPEND PROPERTY LINK_DEPENDS "${LINKER_SCRIPT_FILE_OUT}")
else()
message(WARNING "Requested target '${tgt}' for linker script optimization was not found.")
endif()
endforeach()
else()
if(LINUX AND CPU_AARCH64)
message(WARNING [[
It is strongly recommend to enable linker script optimization for all AArch64 Linux builds.
To do so please export USE_PRIORITIZED_TEXT_FOR_LD=1
]])
endif()
endif()

View File

@ -180,7 +180,7 @@ void Context::setUserEnabledNNPACK(bool e) {
}
bool Context::allowTF32CuDNN(const std::string& op) const {
if (op.empty()){
if (op.size() == 0){
bool allow_tf32_rnn = float32Precision("cuda", "rnn") == "tf32";
bool allow_tf32_conv = float32Precision("cuda", "conv") == "tf32";
TORCH_CHECK(
@ -281,6 +281,9 @@ bool Context::userEnabledOverrideableSDP() const {
static constexpr const auto cublas_config_var_name = "CUBLAS_WORKSPACE_CONFIG";
static constexpr const std::array<const char*, 2> cublas_deterministic_configs = {":4096:8", ":16:8"};
#ifdef USE_ROCM
static constexpr const auto hipblaslt_allow_tf32 = "HIPBLASLT_ALLOW_TF32";
#endif
bool Context::checkCuBLASConfigDeterministic() {
// If using CUDA 10.2 or greater, need to make sure CuBLAS workspace config
@ -340,6 +343,12 @@ void Context::setImmediateMiopen(bool b) {
}
bool Context::allowTF32CuBLAS() const {
#ifdef USE_ROCM
const auto allow_tf32 = c10::utils::check_env(hipblaslt_allow_tf32);
if (allow_tf32 != true) {
return false;
}
#endif
bool legacy_allow_tf32 = float32_matmul_precision != at::Float32MatmulPrecision::HIGHEST;
bool allow_tf32_new = float32Precision("cuda", "matmul") == "tf32";
TORCH_CHECK(
@ -353,6 +362,14 @@ bool Context::allowTF32CuBLAS() const {
}
void Context::setAllowTF32CuBLAS(bool b) {
#ifdef USE_ROCM
const auto allow_tf32 = c10::utils::check_env(hipblaslt_allow_tf32);
if (allow_tf32 != true) {
C10_LOG_FIRST_N(INFO, 10) << "torch.backends.cuda.matmul.allow_tf32 is not supported on ROCm by default. "
<< "Please set environment variable HIPBLASLT_ALLOW_TF32=1 to enable it.";
return;
}
#endif
float32_matmul_precision = b ? at::Float32MatmulPrecision::HIGH : at::Float32MatmulPrecision::HIGHEST;
setFloat32Precision("cuda", "matmul", b ? "tf32" : "ieee");
}
@ -426,7 +443,7 @@ void Context::setFloat32Precision(const std::string& backend, const std::string&
std::string msg;
auto iterp = _fp32_precisions.find(backend);
TORCH_CHECK(iterp != _fp32_precisions.end());
for (const auto& p : iterp->second) {
for (auto p : iterp->second) {
msg += p;
msg += " ";
}
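
As a usage-level sketch of the HIPBLASLT_ALLOW_TF32 gating shown in the hunks above (assuming a ROCm build; on CUDA builds the environment variable is ignored):

    import os
    import torch

    # On ROCm, TF32 matmuls stay disabled unless HIPBLASLT_ALLOW_TF32 is set,
    # so make sure it is in the environment before flipping the backend flag.
    os.environ.setdefault("HIPBLASLT_ALLOW_TF32", "1")

    torch.backends.cuda.matmul.allow_tf32 = True
    print(torch.backends.cuda.matmul.allow_tf32)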

View File

@ -133,7 +133,7 @@ FunctionalTensorWrapper::FunctionalTensorWrapper(const Tensor& view_value, const
: c10::TensorImpl(
c10::DispatchKeySet(DispatchKey::Functionalize),
view_value.dtype(),
base->storage().data_ptr().device()
view_value.device()
),
value_(view_value),
is_multi_output_view_(base->is_multi_output_view_ || meta.is_multi_output),
@ -485,10 +485,7 @@ void FunctionalTensorWrapper::shallow_copy_from(const c10::intrusive_ptr<TensorI
c10::Device FunctionalTensorWrapper::device_custom() const {
// The storage pointer already uses the underlying tensor custom device (if
// applicable) to extract the device. So, we dont have to recurse again by
// doing value_.unsafeGetTensorImpl()->device().
return storage().data_ptr().device();
return value_.unsafeGetTensorImpl()->device();
}
at::IntArrayRef FunctionalTensorWrapper::sizes_custom() const {
return value_.unsafeGetTensorImpl()->sizes();

View File

@ -1954,8 +1954,8 @@ void scaled_gemm(
#if ROCM_VERSION >= 70000
if (at::detail::getCUDAHooks().isGPUArch({"gfx950"})) {
// TODO: add constraints based on hipblaslt internals
TORCH_CHECK((m % 16 == 0) && (n % 16 == 0) && (k % 128 == 0),
"M, N must be multiples of 16 and K should be multiple of 128 for MX format. "
TORCH_CHECK((m % 32 == 0) && (n % 32 == 0) && (k % 32 == 0),
"Matrix dimensions must be multiples of 32 for MX format. "
"Got m=", m, ", n=", n, ", k=", k);
}
#endif

View File

@ -1138,14 +1138,9 @@ bool is_blockwise_1x16_scaling(const at::Tensor& t, const at::Tensor& scale) {
bool is_blockwise_1x32_scaling(const at::Tensor& t, const at::Tensor& scale) {
// TODO: We might want to enforce some structure on the shapes of the scale
// tensors
bool is_fp8_path = (isFloat8Type(t.scalar_type()) && scale.scalar_type() == at::kFloat8_e8m0fnu
&& scale.numel() == round_up<int64_t>(t.size(0), 128) * round_up<int64_t>(ceil_div<int64_t>(t.size(1), 32), 4));
bool is_packed_fp4_path = false;
#ifdef USE_ROCM
is_packed_fp4_path = (t.scalar_type() == ScalarType::Float4_e2m1fn_x2 && scale.scalar_type() == at::kFloat8_e8m0fnu
&& scale.numel() == round_up<int64_t>(t.size(0), 128) * round_up<int64_t>(ceil_div<int64_t>(t.size(1) * 2, 32), 4));
#endif
return (is_fp8_path || is_packed_fp4_path) && scale.is_contiguous();
return (isFloat8Type(t.scalar_type()) && scale.scalar_type() == at::kFloat8_e8m0fnu
&& scale.numel() == round_up<int64_t>(t.size(0), 128) * round_up<int64_t>(ceil_div<int64_t>(t.size(1), 32), 4)
&& scale.is_contiguous());
}
bool is_blockwise_1x128_scaling(const at::Tensor& t, const at::Tensor& scale) {
@ -1386,15 +1381,9 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2,
TORCH_CHECK(at::detail::getCUDAHooks().isGPUArch({"gfx950"}),
"Block-wise scaling for Float8_e8m0fnu is only supported on gfx950");
int packed_factor = 1;
if (mat1.scalar_type() == ScalarType::Float4_e2m1fn_x2) {
// For float4 data type, each byte stores two 4-bit floating-point values,
// effectively packing two elements into one byte.
packed_factor = 2;
}
TORCH_CHECK(mat1.size(0) % 16 == 0 && (mat1.size(1) * packed_factor) % 128 == 0 &&
mat2.size(1) % 16 == 0,
"M, N must be multiples of 16 and K must be multiple of 128 for block-wise scaling");
TORCH_CHECK(mat1.size(0) % 32 == 0 && mat1.size(1) % 32 == 0 &&
mat2.size(0) % 32 == 0 && mat2.size(1) % 32 == 0,
"Matrix dimensions must be multiples of 32 for block-wise scaling");
TORCH_CHECK(out.scalar_type() == ScalarType::BFloat16 ||
out.scalar_type() == ScalarType::Half,
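
To make the float4 packing arithmetic described above concrete (illustrative numbers only):

    # Float4_e2m1fn_x2 stores two 4-bit values per byte, so a stored last
    # dimension of 64 bytes represents a logical K of 128 elements, which
    # satisfies the "K must be a multiple of 128" constraint checked above.
    k_stored = 64
    packed_factor = 2  # two fp4 values per byte
    k_logical = k_stored * packed_factor
    print(k_logical % 128 == 0)  # True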

View File

@ -226,38 +226,6 @@ __global__ void CatArrayBatchedCopy_contig(
}
}
template <typename T, typename IndexType, int Dims, int batch_size, int stride_size, int alignment, int elems_per_vec>
__global__ void CatArrayBatchedCopy_vectorized(
char* output,
CatArrInputTensorMetadata<T, IndexType, batch_size, stride_size> inputs,
TensorSizeStride<IndexType, CAT_ARRAY_MAX_INPUT_DIMS> os,
const int concatDim,
IndexType trailingSize) {
IndexType tid = blockIdx.x * blockDim.x + threadIdx.x;
IndexType nElements = inputs.nElements[blockIdx.y] / elems_per_vec;
if(tid >= nElements) return;
const char * data = (char*)inputs.input[blockIdx.y];
IndexType offset = inputs.offset[blockIdx.y] * trailingSize / elems_per_vec;
IndexType dimSize = inputs.dimSize[blockIdx.y] * trailingSize / elems_per_vec;
int64_t dataOffset = (int64_t)offset * alignment; // in bytes
IndexType stride = gridDim.x * blockDim.x;
while( tid < nElements){
int64_t elementOffset = (int64_t)CatArrIndexToOffset<IndexType, Dims>::compute(
os.tensorSize, os.tensorStride, dimSize, concatDim, tid) * alignment; // in bytes
auto vec = at::native::memory::ld_vec<alignment>(data + (int64_t)alignment * tid);
at::native::memory::st_vec<alignment>(output + dataOffset + elementOffset, vec);
tid += stride;
}
}
/*
Specialized implementation of the CatArrayBatchedCopy written to generate wide memory loads
to improve memory bandwidth throughput.
@ -328,27 +296,12 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i
scalar_t *data = (scalar_t *)(out.mutable_data_ptr());
CatArrInputTensorMetadata<scalar_t, unsigned int, batch_size, stride_size> catMetaData;
TensorSizeStride<unsigned int, CAT_ARRAY_MAX_INPUT_DIMS> outputParam;
// If all batches are contiguous we can call a specialized implementation
// which requires the input tensor addresses to be aligned to a
// 16 Byte boundary.
constexpr bool isContig = stride_size == 1;
bool isAligned = true;
constexpr int alignment = 16;
// Next, let's initialize the size, stride arrays for the output Tensor.
// for contig case, we'll canonicalize output strides, so that
// we don't have arbitrary strides for dims of size 0
size_t stride0 = 1;
if (memory_format == c10::MemoryFormat::Contiguous) {
for (int i = nDims - 1; i >= 0; --i) {
for (int i = 0; i < nDims; ++i) {
outputParam.tensorSize[i] = out.size(i);
if (isContig) {
outputParam.tensorStride[i] = stride0;
stride0 *= out.size(i);
} else {
outputParam.tensorStride[i] = out.stride(i);
}
outputParam.tensorStride[i] = out.stride(i);
}
} else if (memory_format == c10::MemoryFormat::ChannelsLast || memory_format == c10::MemoryFormat::ChannelsLast3d) {
// permute the semantics of dims from NCHW to NHWC so that the input
@ -367,15 +320,12 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i
at::cuda::CUDAStream stream = at::cuda::getCurrentCUDAStream();
// If all batches are contiguous we can call a specialized implementation
// which requires the input tensor addresses to be aligned to a
// 16 Byte boundary.
// for channels last computing slice size correctly is much more involved, so we never send it
// on the fully vectorized path
// we need output stride in cat dimension to be multiple of alignment,
// if we ever use it to compute offsets
// for catting in 0th dimension it doesn't matter
bool isInOutAligned = isContig && at::native::memory::get_alignment(data) >= alignment &&
memory_format == c10::MemoryFormat::Contiguous && (dimension == 0 ||
outputParam.tensorStride[dimension - 1] * sizeof(scalar_t) % alignment == 0);
bool isContig = true;
bool isAligned = true;
unsigned int max_elements_per_tensor = 0;
// Now we loop
@ -391,16 +341,6 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i
// high-dimensional tensor
if (inputs[i+batchCounter].get().numel() > 0) {
dimSize = inputs[i+batchCounter].get().size(dimension);
if (isInOutAligned) {
auto t = inputs[i+batchCounter].get();
// similarly to output stride, we cannot trust stride value to
// determine slice size if the corresponding dimension is 1
// we have to multiply all the subsequent sizes
int64_t slice_size = dimension == 0 ? t.numel() : t.sizes()[dimension - 1] != 1 ?
t.strides()[dimension - 1] : c10::multiply_integers(t.sizes().begin() + dimension, t.sizes().end());
slice_size *= sizeof(scalar_t);
isInOutAligned &= (slice_size % alignment == 0);
}
}
catMetaData.input[batchCounter] = (scalar_t*)(inputs[i+batchCounter].get().const_data_ptr());
@ -411,12 +351,10 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i
#ifdef USE_ROCM
// On ROCm, CatArrayBatchedCopy_contig is faster
isAligned = false;
isInOutAligned = false;
#else
// If at least one of the inputs is not aligned, we can't call the
// CatArrayBatchedCopy_alignedK_contig
isAligned &= is_aligned_vec4(catMetaData.input[batchCounter]);
isInOutAligned &= at::native::memory::get_alignment(catMetaData.input[batchCounter]) >= alignment;
#endif
if (stride_size > 1) {
@ -427,6 +365,7 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i
catMetaData.tensorStride[batchCounter].tensorStride[j] = strides[j];
}
catMetaData.isContiguous[batchCounter] = false;
isContig = false;
} else {
catMetaData.isContiguous[batchCounter] = true;
}
@ -449,13 +388,10 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i
max_elements_per_tensor, batchCounter);
#else
dim3 applyBlock, catGrid;
if (isInOutAligned) {
std::tie(catGrid, applyBlock) = getCatGridContig<scalar_t, alignment>(
max_elements_per_tensor, batchCounter);
} else if (isContig && isAligned && sizeof(scalar_t) > 2) {
if (isContig && sizeof(scalar_t) > 2) {
std::tie(catGrid, applyBlock) = getCatGridContig<scalar_t, ALIGNED_VEC_LOAD_BYTES_16>(
max_elements_per_tensor, batchCounter);
} else if (isContig && isAligned && sizeof(scalar_t) == 2) {
} else if (isContig && sizeof(scalar_t) == 2) {
std::tie(catGrid, applyBlock) = getCatGridContig<scalar_t, ALIGNED_VEC_LOAD_BYTES_8>(
max_elements_per_tensor, batchCounter);
} else {
@ -463,30 +399,6 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i
getCatGrid(batchCounter, catGrid);
}
#endif
int32_t trailingSize;
TensorSizeStride<unsigned int, CAT_ARRAY_MAX_INPUT_DIMS> kernelOutputParam;
if (isInOutAligned) {
// in this case we can and should flatten the tensors after the cat dim
// we want to view the tensors as if consisting of `alignment`-sized elements
// however, we might not be able to cleanly divide just the last dim -
// it might not be the multiple of alignment.
// however, we know that the full concatted slice is multiple of alignment,
// so if we flatten all the dims after and including concat dim,
// it will be divisible by alignment
// then we need to divide last out size by elems_per_vec,
// and divide all strides except last by elems_per_vec (last stride is 1 always)
// for input, we will fix up the sizes and strides in the kernel directly
kernelOutputParam = outputParam;
nDims = dimension + 1;
constexpr auto elems_per_vec = alignment / sizeof(scalar_t);
auto out_size = dimension == 0 ? out.numel() : kernelOutputParam.tensorStride[dimension-1];
kernelOutputParam.tensorSize[dimension] = out_size / elems_per_vec;
trailingSize = outputParam.tensorStride[dimension];
kernelOutputParam.tensorStride[dimension] = 1;
for (int i = 0; i < dimension; ++i) {
kernelOutputParam.tensorStride[i] /= elems_per_vec;
}
}
if (memory_format != c10::MemoryFormat::Contiguous) {
switch (dimension) {
@ -501,12 +413,7 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i
}
// Template Declarations for dim = 1, 2, 3, 4
#define HANDLE_CASE(DIMS) \
if (isInOutAligned) {\
constexpr auto elems_per_vec = alignment / sizeof(scalar_t); \
CatArrayBatchedCopy_vectorized<scalar_t, unsigned int, DIMS, batch_size, stride_size, alignment, elems_per_vec><<<\
catGrid, applyBlock, 0, stream.stream()>>>(\
(char*)data, catMetaData, kernelOutputParam, dimension, trailingSize);\
} else if (isContig && isAligned && sizeof(scalar_t) > 2 && sizeof(scalar_t) <= 8) {\
if (isContig && isAligned && sizeof(scalar_t) > 2 && sizeof(scalar_t) <= 8) {\
CatArrayBatchedCopy_alignedK_contig<scalar_t, unsigned int, DIMS, batch_size, stride_size, ALIGNED_VEC_LOAD_BYTES_16><<<\
catGrid, applyBlock, 0, stream.stream()>>>(\
data, catMetaData, outputParam, dimension, outputParam.tensorStride[dimension]);\

View File

@ -559,60 +559,4 @@ Tensor _int_mm_xpu(const Tensor& self, const Tensor& mat2) {
at::empty({self.size(0), mat2.size(1)}, self.options().dtype(at::kInt));
return _int_mm_out_xpu(self, mat2, result);
}
Tensor _weight_int8pack_mm_xpu(
const Tensor& A,
const Tensor& B,
const Tensor& scales) {
auto M = A.size(0);
auto N = B.size(0);
auto K = A.size(1);
TORCH_CHECK(
A.dtype() == kBFloat16 || A.dtype() == kHalf || A.dtype() == kFloat,
" : expect A to be either 32-bit or 16-bit float tensor.");
TORCH_CHECK(A.dim() == 2, __func__, " : expect A to be 2D tensor.");
TORCH_CHECK(
A.stride(1) == 1, " : A must be contiguous on the last dimension.");
TORCH_CHECK(B.dtype() == kChar, " : expect B to be int8 tensor.");
TORCH_CHECK(B.is_contiguous(), " : expect B to be contiguous.");
TORCH_CHECK(B.size(1) == K, " : expect B.size(1) == ", K);
TORCH_CHECK(
scales.dim() == 1 && scales.size(0) == N,
" : expect scales to be 1d tensor with size ",
N);
auto C = at::empty({M, N}, A.options());
// --- Launch kernel ---
Tensor bias = at::Tensor();
Tensor mat2_zero_points = at::Tensor();
Tensor non_const_scales = scales;
auto post_op_args = torch::List<std::optional<at::Scalar>>();
at::native::onednn::quantized_matmul(
A.contiguous(),
1.0,
0,
B,
non_const_scales,
mat2_zero_points,
bias,
C,
1.0,
0,
C.scalar_type(),
/*other*/ std::nullopt,
/*other scale*/ 1.0,
/*other zp*/ 0,
/*binary post op*/ "none",
/*binary alpha*/ 1.0,
/*post_op_name*/ "none",
post_op_args,
/*post_op_algorithm*/ "none",
/*m2_trans*/ false);
return C;
}
} // namespace at::native
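
A rough sketch of calling the op behind this kernel, torch._weight_int8pack_mm, with illustrative shapes; it assumes a build that exposes this private op and falls back to CPU when no XPU device is present:

    import torch

    device = "xpu" if getattr(torch, "xpu", None) and torch.xpu.is_available() else "cpu"

    # A: activations (M x K), B: int8 weights (N x K), scales: per-channel (N,)
    A = torch.randn(4, 8, dtype=torch.bfloat16, device=device)
    B = torch.randint(-128, 127, (16, 8), dtype=torch.int8, device=device)
    scales = torch.rand(16, dtype=torch.bfloat16, device=device)

    # C[m, n] = sum_k A[m, k] * B[n, k] * scales[n]
    C = torch._weight_int8pack_mm(A, B, scales)
    print(C.shape)  # torch.Size([4, 16])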

View File

@ -110,9 +110,8 @@ void quantized_matmul(
// [Note] Quantized Matrix Multiplication at XPU
// The following code integrates oneDNN quantized gemm. The quantization
// config we support:
// activation: s8, u8, fp16, bf16, fp32; per tensor calibrated;
// symmetric&asymmetric weight: s8; per_tensor/per_channel calibrated;
// symmetric
// activation: s8&u8; per tensor calibrated; symmetric&asymmetric
// weight: s8; per_tensor/per_channel calibrated; symmetric
auto attr = Attr(static_cast<float>(1.0 / output_scale), output_zero_point);
construct_attr_by_post_op(
binary_post_op,

View File

@ -534,18 +534,6 @@ static void max_unpool_out_mps_template(const Tensor& input,
output.resize_(output_size, memory_format);
output.fill_(0);
if (indices.defined() && indices.numel() > 0) {
auto output_image_size = c10::multiply_integers(output_size_);
int64_t min_idx = indices.min().item<int64_t>();
int64_t max_idx = indices.max().item<int64_t>();
if (min_idx < 0 || max_idx >= output_image_size) {
int64_t error_idx = (min_idx < 0) ? min_idx : max_idx;
TORCH_CHECK(false, "Found an invalid max index: ", error_idx, " for output tensor of shape ", output_size_);
}
}
id<MTLDevice> device = MPSDevice::getInstance()->device();
MPSStream* mpsStream = getCurrentMPSStream();
const auto numThreads = input.numel();

View File

@ -4243,7 +4243,6 @@
CPU: _weight_int8pack_mm_cpu
CUDA: _weight_int8pack_mm_cuda
MPS: _weight_int8pack_mm_mps
XPU: _weight_int8pack_mm_xpu
- func: _sparse_mm(Tensor sparse, Tensor dense) -> Tensor
python_module: sparse
@ -10849,7 +10848,6 @@
dispatch:
CompositeExplicitAutograd: foreach_tensor_clamp_min_scalar_kernel_slow_
CUDA: foreach_tensor_clamp_min_scalar_kernel_cuda_
MTIA: foreach_tensor_maximum_scalar_kernel_mtia_
autogen: _foreach_maximum.Scalar_out
# foreach_minimum/maximum dispatches to clamp_max/min

View File

@ -174,11 +174,11 @@ YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass,0
meta-llama/Llama-3.2-1B,pass,5
google/gemma-2-2b,pass,0
google/gemma-2-2b,pass,5
@ -186,8 +186,8 @@ google/gemma-3-4b-it,pass_due_to_skip,0
openai/whisper-tiny,pass,0
openai/whisper-tiny,pass,6
Qwen/Qwen3-0.6B,pass,0
Qwen/Qwen3-0.6B,pass,5


View File

@ -162,7 +162,7 @@ hf_GPT2_large,pass_due_to_skip,0
hf_Reformer,pass,5
hf_Reformer,pass,8
@ -178,7 +178,7 @@ hf_T5_base,eager_fail_to_run,0
hf_T5_generate,pass,7
hf_T5_generate,pass,11


View File

@ -110,7 +110,7 @@ hf_GPT2_large,pass_due_to_skip,0
hf_Reformer,pass,20
hf_Reformer,pass,25


View File

@ -170,15 +170,15 @@ YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,fail_to_run,0
meta-llama/Llama-3.2-1B,fail_accuracy,0
google/gemma-2-2b,fail_to_run,0
google/gemma-2-2b,fail_accuracy,0
google/gemma-3-4b-it,fail_to_run,0
google/gemma-3-4b-it,fail_accuracy,0
@ -186,4 +186,4 @@ openai/whisper-tiny,fail_to_run,0
Qwen/Qwen3-0.6B,fail_to_run,0
Qwen/Qwen3-0.6B,fail_accuracy,0


View File

@ -138,7 +138,7 @@ hf_Bert_large,pass,0
hf_BigBird,pass,27
hf_BigBird,pass,25
@ -158,7 +158,7 @@ hf_Longformer,pass,4
hf_Reformer,pass,5
hf_Reformer,pass,8


View File

@ -138,7 +138,7 @@ hf_Bert_large,pass,0
hf_BigBird,pass,27
hf_BigBird,pass,25
@ -158,7 +158,7 @@ hf_Longformer,pass,4
hf_Reformer,pass,5
hf_Reformer,pass,8


View File

@ -138,7 +138,7 @@ hf_Bert_large,pass,0
hf_BigBird,pass,27
hf_BigBird,pass,25
@ -158,7 +158,7 @@ hf_Longformer,pass,4
hf_Reformer,pass,5
hf_Reformer,pass,8


View File

@ -174,11 +174,11 @@ YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass,0
meta-llama/Llama-3.2-1B,pass,5
google/gemma-2-2b,pass,0
google/gemma-2-2b,pass,5
@ -186,8 +186,8 @@ google/gemma-3-4b-it,pass_due_to_skip,0
openai/whisper-tiny,pass,0
openai/whisper-tiny,pass,6
Qwen/Qwen3-0.6B,pass,0
Qwen/Qwen3-0.6B,pass,5


View File

@ -162,7 +162,7 @@ hf_GPT2_large,pass_due_to_skip,0
hf_Reformer,pass,5
hf_Reformer,pass,8
@ -178,7 +178,7 @@ hf_T5_base,eager_fail_to_run,0
hf_T5_generate,pass,7
hf_T5_generate,pass,11


View File

@ -110,7 +110,7 @@ hf_GPT2_large,pass_due_to_skip,0
hf_Reformer,pass,20
hf_Reformer,pass,25


View File

@ -122,7 +122,7 @@ hf_Bert_large,pass,0
hf_BigBird,pass,27
hf_BigBird,pass,25
@ -142,7 +142,7 @@ hf_Longformer,pass,4
hf_Reformer,pass,5
hf_Reformer,pass,8


View File

@ -174,11 +174,11 @@ YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass,0
meta-llama/Llama-3.2-1B,pass,5
google/gemma-2-2b,pass,0
google/gemma-2-2b,pass,5
@ -186,8 +186,8 @@ google/gemma-3-4b-it,pass,0
openai/whisper-tiny,pass,0
openai/whisper-tiny,pass,6
Qwen/Qwen3-0.6B,pass,0
Qwen/Qwen3-0.6B,pass,5


View File

@ -162,7 +162,7 @@ hf_GPT2_large,pass_due_to_skip,0
hf_Reformer,pass,5
hf_Reformer,pass,8
@ -178,7 +178,7 @@ hf_T5_base,eager_fail_to_run,0
hf_T5_generate,pass,7
hf_T5_generate,pass,11


View File

@ -110,7 +110,7 @@ hf_GPT2_large,pass_due_to_skip,0
hf_Reformer,pass,20
hf_Reformer,pass,25


View File

@ -174,11 +174,11 @@ YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass,0
meta-llama/Llama-3.2-1B,pass,5
google/gemma-2-2b,pass,0
google/gemma-2-2b,pass,5
@ -186,8 +186,8 @@ google/gemma-3-4b-it,pass_due_to_skip,0
openai/whisper-tiny,pass,0
openai/whisper-tiny,pass,6
Qwen/Qwen3-0.6B,pass,0
Qwen/Qwen3-0.6B,pass,5


View File

@ -171,23 +171,3 @@ XLNetLMHeadModel,pass,5
YituTechConvBert,pass,5
meta-llama/Llama-3.2-1B,eager_fail_to_run,0
google/gemma-2-2b,eager_fail_to_run,0
google/gemma-3-4b-it,eager_fail_to_run,0
openai/whisper-tiny,eager_fail_to_run,0
Qwen/Qwen3-0.6B,eager_fail_to_run,0


View File

@ -162,7 +162,7 @@ hf_GPT2_large,pass_due_to_skip,0
hf_Reformer,pass,5
hf_Reformer,pass,8
@ -178,7 +178,7 @@ hf_T5_base,eager_fail_to_run,0
hf_T5_generate,pass,7
hf_T5_generate,pass,11


View File

@ -110,7 +110,7 @@ hf_GPT2_large,pass_due_to_skip,0
hf_Reformer,pass,20
hf_Reformer,pass,25


View File

@ -174,11 +174,11 @@ YituTechConvBert,pass,0
meta-llama/Llama-3.2-1B,pass,0
meta-llama/Llama-3.2-1B,pass,5
google/gemma-2-2b,pass,0
google/gemma-2-2b,pass,5
@ -186,8 +186,8 @@ google/gemma-3-4b-it,pass_due_to_skip,0
openai/whisper-tiny,pass,0
openai/whisper-tiny,pass,6
Qwen/Qwen3-0.6B,pass,0
Qwen/Qwen3-0.6B,pass,5


View File

@ -162,7 +162,7 @@ hf_GPT2_large,pass_due_to_skip,0
hf_Reformer,pass,5
hf_Reformer,pass,8
@ -178,7 +178,7 @@ hf_T5_base,eager_fail_to_run,0
hf_T5_generate,pass,7
hf_T5_generate,pass,11


View File

@ -110,7 +110,7 @@ hf_GPT2_large,pass_due_to_skip,0
hf_Reformer,pass,20
hf_Reformer,pass,25


View File

@ -181,7 +181,7 @@ hf_T5_base,pass,0
hf_T5_generate,pass,7
hf_T5_generate,pass,11
@ -205,7 +205,7 @@ llama,pass,0
llama_v2_7b_16h,pass_due_to_skip,0
llama_v2_7b_16h,model_fail_to_load,0


View File

@ -178,7 +178,7 @@ llama,fail_to_run,0
llama_v2_7b_16h,pass_due_to_skip,0
llama_v2_7b_16h,model_fail_to_load,0


View File

@ -181,7 +181,7 @@ hf_T5_base,pass,0
hf_T5_generate,pass,7
hf_T5_generate,pass,11


View File

@ -198,7 +198,7 @@ llama,pass,0
llama_v2_7b_16h,pass_due_to_skip,0
llama_v2_7b_16h,model_fail_to_load,0


View File

@ -171,23 +171,3 @@ XLNetLMHeadModel,pass,5
YituTechConvBert,pass,5
meta-llama/Llama-3.2-1B,eager_failed_to_run,0
google/gemma-2-2b,eager_failed_to_run,0
google/gemma-3-4b-it,eager_failed_to_run,0
openai/whisper-tiny,eager_failed_to_run,0
Qwen/Qwen3-0.6B,eager_failed_to_run,0


View File

@ -181,7 +181,7 @@ hf_T5_base,pass,0
hf_T5_generate,pass,7
hf_T5_generate,pass,11


View File

@ -198,7 +198,7 @@ llama,pass,0
llama_v2_7b_16h,pass_due_to_skip,0
llama_v2_7b_16h,model_fail_to_load,0


View File

@ -6,7 +6,7 @@ add_loop_eager_dynamic,compile_time_instruction_count,4432000000,0.1
add_loop_inductor,compile_time_instruction_count,29660000000,0.1
add_loop_inductor,compile_time_instruction_count,30280000000,0.1
@ -50,27 +50,27 @@ symint_sum_loop,compile_time_instruction_count,4299000000,0.1
aotdispatcher_inference_nosubclass_cpu,compile_time_instruction_count,1869000000,0.1
aotdispatcher_inference_nosubclass_cpu,compile_time_instruction_count,2151000000,0.1
aotdispatcher_inference_subclass_cpu,compile_time_instruction_count,5281000000,0.1
aotdispatcher_inference_subclass_cpu,compile_time_instruction_count,6124000000,0.1
aotdispatcher_partitioner_cpu,compile_time_instruction_count,8333000000,0.1
aotdispatcher_partitioner_cpu,compile_time_instruction_count,9005000000,0.1
aotdispatcher_partitioner_cpu2,compile_time_instruction_count,1909000000,0.1
aotdispatcher_partitioner_cpu2,compile_time_instruction_count,1989000000,0.1
aotdispatcher_training_nosubclass_cpu,compile_time_instruction_count,3442000000,0.1
aotdispatcher_training_nosubclass_cpu,compile_time_instruction_count,3959000000,0.1
aotdispatcher_training_subclass_cpu,compile_time_instruction_count,9239000000,0.1
aotdispatcher_training_subclass_cpu,compile_time_instruction_count,10650000000,0.1
@ -78,7 +78,7 @@ mm_loop_inductor_gpu,compile_time_instruction_count,4820968837,0.1
mm_loop_inductor_dynamic_gpu,compile_time_instruction_count,9051000000,0.1
mm_loop_inductor_dynamic_gpu,compile_time_instruction_count,8802129167,0.1
@ -86,4 +86,4 @@ basic_NestedModule_eager,compile_time_instruction_count,9554000000,0.1
basic_InlineMod_eager,compile_time_instruction_count,7618000000,0.1
basic_InlineMod_eager,compile_time_instruction_count,7464000000,0.1


View File

@ -1998,21 +1998,7 @@ def define_buck_targets(
third_party("sleef_arm"),
],
}),
compiler_flags = get_aten_compiler_flags() + select({
"DEFAULT": [],
"ovr_config//os:android-arm32": [
"-mfpu=vfpv3-d16",
"-march=armv7-a",
"-mthumb",
"-mfpu=neon",
],
"ovr_config//os:android-x86_32": [
"-mssse3",
],
"ovr_config//os:android-x86_64": [
"-mssse3",
],
}),
compiler_flags = get_aten_compiler_flags(),
exported_preprocessor_flags = get_aten_preprocessor_flags(),
exported_deps = [
":aten_header",

View File

@ -1,5 +1,4 @@
#include <c10/core/impl/DeviceGuardImplInterface.h>
#include <c10/core/impl/FakeGuardImpl.h>
#include <array>
namespace c10::impl {
@ -15,26 +14,4 @@ DeviceGuardImplRegistrar::DeviceGuardImplRegistrar(
device_guard_impl_registry[static_cast<size_t>(type)].store(impl);
}
namespace {
thread_local std::unique_ptr<DeviceGuardImplInterface> tls_fake_device_guard =
nullptr;
}
void ensureCUDADeviceGuardSet() {
constexpr auto cuda_idx = static_cast<std::size_t>(DeviceType::CUDA);
const DeviceGuardImplInterface* p =
device_guard_impl_registry[cuda_idx].load();
// A non-null `ptr` indicates that the CUDA guard is already set up,
// implying this is using cuda build
if (p && p->deviceCount() == 0) {
// In following cases, we override CUDA guard interface with a no-op
// device guard. When p->deviceCount() == 0, cuda build is enabled, but no
// cuda devices available.
tls_fake_device_guard = std::make_unique<FakeGuardImpl<DeviceType::CUDA>>();
device_guard_impl_registry[cuda_idx].store(tls_fake_device_guard.get());
}
}
} // namespace c10::impl

View File

@ -6,7 +6,6 @@
#include <c10/util/Exception.h>
// Just for C10_ANONYMOUS_VARIABLE
#include <c10/core/impl/TorchDispatchModeTLS.h>
#include <c10/util/Registry.h>
#include <array>
@ -252,7 +251,7 @@ struct C10_API DeviceGuardImplInterface {
// for devices that don't actually have a concept of device index. Prominent
// examples are CPU and Meta.
template <DeviceType D>
struct NoOpDeviceGuardImpl : public DeviceGuardImplInterface {
struct NoOpDeviceGuardImpl final : public DeviceGuardImplInterface {
NoOpDeviceGuardImpl() = default;
DeviceType type() const override {
return D;
@ -372,7 +371,5 @@ inline bool hasDeviceGuardImpl(DeviceType type) {
return device_guard_impl_registry[static_cast<size_t>(type)].load();
}
void C10_API ensureCUDADeviceGuardSet();
} // namespace impl
} // namespace c10

View File

@ -158,7 +158,6 @@ function(caffe2_print_configuration_summary)
if(${USE_KLEIDIAI})
message(STATUS " USE_KLEIDIAI : ${USE_KLEIDIAI}")
endif()
message(STATUS " USE_PRIORITIZED_TEXT_FOR_LD : ${USE_PRIORITIZED_TEXT_FOR_LD}")
message(STATUS " USE_UCC : ${USE_UCC}")
if(${USE_UCC})
message(STATUS " USE_SYSTEM_UCC : ${USE_SYSTEM_UCC}")

View File

@ -482,7 +482,6 @@ function(torch_update_find_cuda_flags)
endfunction()
include(CheckCXXCompilerFlag)
include(CheckLinkerFlag)
##############################################################################
# CHeck if given flag is supported and append it to provided outputvar
@ -512,22 +511,3 @@ function(target_compile_options_if_supported target flag)
target_compile_options(${target} PRIVATE ${flag})
endif()
endfunction()
# Check if a global link option is supported
function(add_link_options_if_supported flag)
check_linker_flag(C "LINKER:${flag}" _supported)
if("${_supported}")
add_link_options("LINKER:${flag}")
else()
message(WARNING "Attempted to use unsupported link option : ${flag}.")
endif()
endfunction()
function(target_link_options_if_supported tgt flag)
check_linker_flag(C "LINKER:${flag}" _supported)
if("${_supported}")
target_link_options("${tgt}" PRIVATE "LINKER:${flag}")
else()
message(WARNING "Attempted to use unsupported link option : ${flag}.")
endif()
endfunction()

View File

@ -1,7 +0,0 @@
.. role:: hidden
:class: hidden-section
.. currentmodule:: {{ module }}
{{ name | underline }}
.. autofunction:: {{ fullname }}

View File

@ -1,7 +0,0 @@
.. role:: hidden
:class: hidden-section
.. currentmodule:: {{ module }}
{{ name | underline }}
.. automethod:: {{ fullname }}

View File

@ -0,0 +1,62 @@
torch.utils.bottleneck
======================
.. automodule:: torch.utils.bottleneck
.. currentmodule:: torch.utils.bottleneck
`torch.utils.bottleneck` is a tool that can be used as an initial step for
debugging bottlenecks in your program. It summarizes runs of your script with
the Python profiler and PyTorch's autograd profiler.
Run it on the command line with
::
python -m torch.utils.bottleneck /path/to/source/script.py [args]
where [args] are any number of arguments to `script.py`, or run
``python -m torch.utils.bottleneck -h`` for more usage instructions.
.. warning::
Because your script will be profiled, please ensure that it exits in a
finite amount of time.
.. warning::
Due to the asynchronous nature of CUDA kernels, when running against
CUDA code, the cProfile output and CPU-mode autograd profilers may
not show correct timings: the reported CPU time reports the amount of time
used to launch the kernels but does not include the time the kernel
spent executing on a GPU unless the operation does a synchronize.
Ops that do synchronize appear to be extremely expensive under regular
CPU-mode profilers.
In these case where timings are incorrect, the CUDA-mode autograd profiler
may be helpful.
.. note::
To decide which (CPU-only-mode or CUDA-mode) autograd profiler output to
look at, you should first check if your script is CPU-bound
("CPU total time is much greater than CUDA total time").
If it is CPU-bound, looking at the results of the CPU-mode autograd
profiler will help. If on the other hand your script spends most of its
time executing on the GPU, then it makes sense to start
looking for responsible CUDA operators in the output of the CUDA-mode
autograd profiler.
Of course the reality is much more complicated and your script might not be
in one of those two extremes depending on the part of the model you're
evaluating. If the profiler outputs don't help, you could try looking at
the result of :func:`torch.autograd.profiler.emit_nvtx()` with ``nvprof``.
However, please take into account that the NVTX overhead is very high and
often gives a heavily skewed timeline. Similarly, ``Intel® VTune™ Profiler``
helps to analyze performance on Intel platforms further with
:func:`torch.autograd.profiler.emit_itt()`.
.. warning::
If you are profiling CUDA code, the first profiler that ``bottleneck`` runs
(cProfile) will include the CUDA startup time (CUDA buffer allocation cost)
in its time reporting. This should not matter if your bottlenecks result
in code much slower than the CUDA startup time.
For more complicated uses of the profilers (like in a multi-GPU case),
please see https://docs.python.org/3/library/profile.html
or :func:`torch.autograd.profiler.profile()` for more information.

View File

@ -66,7 +66,6 @@ extensions = [
"sphinx.ext.linkcode",
"sphinxcontrib.mermaid",
"sphinx_sitemap",
"sphinx_remove_toctrees"
]
myst_enable_extensions = [
@ -75,9 +74,6 @@ myst_enable_extensions = [
"html_image",
]
# Remove the "generated" tag from the toctree to allow for faster builds
remove_from_toctrees = ["generated/*"]
html_baseurl = "https://docs.pytorch.org/docs/stable/" # needed for sphinx-sitemap
sitemap_locales = [None]
sitemap_excludes = [
@ -97,10 +93,8 @@ numpydoc_show_class_members = False
# autosectionlabel throws warnings if section names are duplicated.
# The following tells autosectionlabel to not throw a warning for
# duplicated section names that are in different documents.
autosectionlabel_prefix_document = True
# katex options
#
#
@ -213,41 +207,6 @@ templates_path = [
]
# TODO: document these and remove them from here.
autosummary_filename_map = {
'torch.nn.utils.prune.identity': 'torch.nn.utils.prune.identity_function',
'torch.nn.utils.prune.Identity': 'torch.nn.utils.prune.Identity_class',
'torch.optim.adamw.adamw': 'torch.optim.adamw.adamw_function',
'torch.optim.adamw.AdamW': 'torch.optim.adamw.AdamW_class',
'torch.optim.asgd.asgd': 'torch.optim.asgd.asgd_function',
'torch.optim.asgd.ASGD': 'torch.optim.asgd.ASGD_class',
'torch.optim.nadam.nadam': 'torch.optim.nadam.nadam_function',
'torch.optim.nadam.NAdam': 'torch.optim.nadam.NAdam_class',
'torch.optim.radam.radam': 'torch.optim.radam.radam_function',
'torch.optim.radam.RAdam': 'torch.optim.radam.RAdam_class',
'torch.optim.rmsprop.rmsprop': 'torch.optim.rmsprop.rmsprop_function',
'torch.optim.rmsprop.RMSprop': 'torch.optim.rmsprop.RMSprop_class',
'torch.optim.rprop.rprop': 'torch.optim.rprop.rprop_function',
'torch.optim.rprop.Rprop': 'torch.optim.rprop.Rprop_class',
'torch.optim.sgd.sgd': 'torch.optim.sgd.sgd_function',
'torch.optim.sgd.SGD': 'torch.optim.sgd.SGD_class',
'torch.optim.adadelta.adadelta': 'torch.optim.adadelta.adadelta_function',
'torch.optim.adadelta.Adadelta': 'torch.optim.adadelta.Adadelta_class',
'torch.optim.adagrad.adagrad': 'torch.optim.adagrad.adagrad_function',
'torch.optim.adagrad.Adagrad': 'torch.optim.adagrad.Adagrad_class',
'torch.optim.adam.adam': 'torch.optim.adam.adam_function',
'torch.optim.adam.Adam': 'torch.optim.adam.Adam_class',
'torch.optim.adamax.adamax': 'torch.optim.adamax.adamax_function',
'torch.optim.adamax.Adamax': 'torch.optim.adamax.Adamax_class',
'torch.mtia.stream': 'torch.mtia.stream_function',
'torch.mtia.Stream': 'torch.mtia.Stream_class',
'torch.cpu.stream': 'torch.cpu.stream_function',
'torch.cpu.Stream': 'torch.cpu.Stream_class',
'torch.cuda.stream': 'torch.cuda.stream_function',
'torch.cuda.Stream': 'torch.cuda.Stream_class',
'torch.xpu.stream': 'torch.xpu.stream_function',
'torch.xpu.Stream': 'torch.xpu.Stream_class',
}
coverage_ignore_functions = [
# torch
"typename",

View File

@ -21,10 +21,10 @@
The following operations will fill uninitialized memory when this setting is
turned on:
* :meth:`torch.Tensor.resize_` when called with a tensor that is not
* :func:`torch.Tensor.resize_` when called with a tensor that is not
quantized
* :func:`torch.empty`
* :func:`torch.empty_strided`
* :func:`torch.empty_permuted`
* :func:`torch.empty_like`
```
```
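
A small sketch of the fill_uninitialized_memory setting this note covers, assuming default flag values:

    import torch
    import torch.utils.deterministic  # exposes the fill_uninitialized_memory flag

    # With deterministic algorithms enabled and fill_uninitialized_memory left at
    # its default (True), the factory functions listed above overwrite
    # uninitialized storage: floating-point tensors with NaN, integers with
    # their maximum value.
    torch.use_deterministic_algorithms(True)
    print(torch.utils.deterministic.fill_uninitialized_memory)  # True
    print(torch.empty(3))  # tensor([nan, nan, nan])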

View File

@ -260,73 +260,3 @@ these features.
```{eval-rst}
.. py:module:: torch.distributed.tensor.device_mesh
```
## Mixed Tensor and DTensor operations
So you got the following error message.
```
got mixed torch.Tensor and DTensor, need to convert all
torch.Tensor to DTensor before calling distributed operators!
```
There are two cases.
### Case 1: this is user error
The most common way to run into this error is to create a regular Tensor
(using a factory function) and then perform a Tensor-DTensor operation,
like the following:
```
tensor = torch.arange(10)
return tensor + dtensor
```
We disallow mixed Tensor-DTensor operations: if the input to any operations
(e.g. torch.add) is a DTensor, then all Tensor inputs must be DTensors.
This is because the semantics are ambiguous. We don't know if `tensor` is
the same across ranks or if it is different so we ask that the user
figure out how to construct a DTensor with accurate placements from `tensor`.
If each rank does have the same `tensor`, then please construct a replicated
DTensor:
```
tensor = torch.arange(10)
tensor = DTensor.from_local(tensor, placements=(Replicate(),))
return tensor + dtensor
```
If you wanted to create a DTensor with shards, below is how to do it.
Semantically this means that your Tensor data is split between the shards
and that operations act on the "full stacked data".
```
tensor = torch.full([], RANK)
tensor = DTensor.from_local(tensor, placements=(Shard(0),))
return tensor + dtensor
```
There are other things you may wish to do with your tensor beyond
these situations (these are not the only two options!).
## Case 2: the error came from PyTorch framework code
Sometimes the problem is that PyTorch framework code attempts to perform mixed
Tensor-DTensor operations. These are bugs in PyTorch, please file an issue
so that we can fix them.
On the user side, the only thing you can do is to avoid using the operation
that caused the issue and file a bug report.
For PyTorch Developers: one approach of fixing this is to rewrite PyTorch
framework code to avoid mixed Tensor-DTensor code (like in the previous section).
For PyTorch Developers: the second approach is to turn on DTensor implicit
replication inside the right places in PyTorch framework code.
When on, any mixed Tensor-DTensor operations will assume that the
non-DTensors can be replicated. Please be careful when using this as it
can lead to silent incorrectness.
- [Turning on implicit replication in Python](https://github.com/pytorch/pytorch/blob/d8e6b2fddc54c748d976e8f0ebe4b63ebe36d85b/torch/distributed/tensor/experimental/__init__.py#L15)
- [Turning on implicit replication in C++](https://github.com/pytorch/pytorch/blob/7a0f93344e2c851b9bcf2b9c3225a323d48fde26/aten/src/ATen/DTensorState.h#L10)

View File

@ -8,10 +8,6 @@
These APIs are experimental and subject to change without notice.
:::
```{eval-rst}
.. autoclass:: torch.fx.experimental.sym_node.DynamicInt
```
## torch.fx.experimental.symbolic_shapes
```{eval-rst}

View File

@ -76,6 +76,7 @@ storage
torch.testing <testing>
torch.utils <utils>
torch.utils.benchmark <benchmark_utils>
torch.utils.bottleneck <bottleneck>
torch.utils.checkpoint <checkpoint>
torch.utils.cpp_extension <cpp_extension>
torch.utils.data <data>

View File

@ -227,6 +227,9 @@
# Static link mimalloc into C10, and use mimalloc in alloc_cpu & alloc_free.
# By default, It is only enabled on Windows.
#
# USE_PRIORITIZED_TEXT_FOR_LD
# Uses prioritized text form cmake/prioritized_text.txt for LD
#
# BUILD_LIBTORCH_WHL
# Builds libtorch.so and its dependencies as a wheel
#
@ -320,6 +323,7 @@ from tools.setup_helpers.env import (
IS_LINUX,
IS_WINDOWS,
)
from tools.setup_helpers.generate_linker_script import gen_linker_script
def str2bool(value: str | None) -> bool:
@ -1623,6 +1627,26 @@ def main() -> None:
if BUILD_PYTHON_ONLY:
install_requires += [f"{LIBTORCH_PKG_NAME}=={TORCH_VERSION}"]
if str2bool(os.getenv("USE_PRIORITIZED_TEXT_FOR_LD")):
gen_linker_script(
filein="cmake/prioritized_text.txt", fout="cmake/linker_script.ld"
)
linker_script_path = os.path.abspath("cmake/linker_script.ld")
os.environ["LDFLAGS"] = os.getenv("LDFLAGS", "") + f" -T{linker_script_path}"
os.environ["CFLAGS"] = (
os.getenv("CFLAGS", "") + " -ffunction-sections -fdata-sections"
)
os.environ["CXXFLAGS"] = (
os.getenv("CXXFLAGS", "") + " -ffunction-sections -fdata-sections"
)
elif platform.system() == "Linux" and platform.processor() == "aarch64":
print_box(
"""
WARNING: we strongly recommend enabling linker script optimization for ARM + CUDA.
To do so please export USE_PRIORITIZED_TEXT_FOR_LD=1
"""
)
# Parse the command line and check the arguments before we proceed with
# building deps and setup. We need to set values so `--help` works.
dist = Distribution()

View File

@ -1,4 +1,4 @@
# Owner(s): ["module: sparse"]
# Owner(s): ["module: unknown"]
import copy

View File

@ -1,4 +1,4 @@
# Owner(s): ["module: sparse"]
# Owner(s): ["module: unknown"]
import torch

View File

@ -1,4 +1,4 @@
# Owner(s): ["module: sparse"]
# Owner(s): ["module: unknown"]
import copy
import warnings

View File

@ -1,4 +1,4 @@
# Owner(s): ["module: sparse"]
# Owner(s): ["module: unknown"]
import copy
import itertools

View File

@ -1,4 +1,4 @@
# Owner(s): ["module: sparse"]
# Owner(s): ["module: unknown"]
import copy
import io

View File

@ -1,4 +1,4 @@
# Owner(s): ["module: sparse"]
# Owner(s): ["module: unknown"]
import torch

View File

@ -1,4 +1,4 @@
# Owner(s): ["module: sparse"]
# Owner(s): ["module: unknown"]
import warnings

View File

@ -1,4 +1,4 @@
# Owner(s): ["module: sparse"]
# Owner(s): ["module: unknown"]
import itertools
import re

View File

@ -1,4 +1,4 @@
# Owner(s): ["module: sparse"]
# Owner(s): ["module: unknown"]
import logging

View File

@ -1,4 +1,4 @@
# Owner(s): ["module: sparse"]
# Owner(s): ["module: unknown"]
import copy
import random

View File

@ -0,0 +1,7 @@
# Owner(s): ["module: unknown"]
import torch
x = torch.ones((3, 3), requires_grad=True)
(3 * x).sum().backward()

View File

@ -0,0 +1,17 @@
# Owner(s): ["module: unknown"]
import argparse
import torch
if __name__ == "__main__":
parser = argparse.ArgumentParser()
# Required args. Raises error if they aren't passed.
parser.add_argument("--foo", help="foo", required=True)
parser.add_argument("--bar", help="bar", required=True)
_ = parser.parse_args()
x = torch.ones((3, 3), requires_grad=True)
(3 * x).sum().backward()

View File

@ -0,0 +1,29 @@
# Owner(s): ["module: unknown"]
import torch
import torch.nn as nn
class Model(nn.Module):
def __init__(self) -> None:
super().__init__()
self.linear = nn.Linear(20, 20)
def forward(self, input):
out = self.linear(input[:, 10:30])
return out.sum()
def main():
data = torch.randn(10, 50).cuda()
model = Model().cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=0.0001)
for _ in range(10):
optimizer.zero_grad()
loss = model(data)
loss.backward()
optimizer.step()
if __name__ == "__main__":
main()

View File

@ -117,49 +117,6 @@ class TestFullyShardStateDictMultiProcess(FSDPTest):
for key, value in ref_sharded_sd.items():
self.assertEqual(value, sharded_sd[key])
@skip_if_lt_x_gpu(2)
def test_cached_state_dict(self):
self.run_subtests(
{"mlp_dim": [2, 3, 4, 5], "mutate_after_state_dict": [True, False]},
self._test_cached_state_dict,
)
def _test_cached_state_dict(self, mlp_dim: int, mutate_after_state_dict: bool):
torch.manual_seed(42)
model = nn.Linear(mlp_dim, mlp_dim, bias=False)
fully_shard(model, reshard_after_forward=True)
optim = torch.optim.AdamW(model.parameters(), lr=1e-2)
# call .state_dict() once and use `sd` directly to reduce cpu overhead
sd = model.state_dict()
assert isinstance(model.weight, DTensor)
if not mutate_after_state_dict:
self.assertTrue(
sd["weight"]._local_tensor.untyped_storage().data_ptr()
== model.weight._local_tensor.untyped_storage().data_ptr()
)
else:
model = model.cpu()
model = model.cuda()
self.assertTrue(
sd["weight"]._local_tensor.untyped_storage().data_ptr()
!= model.weight._local_tensor.untyped_storage().data_ptr()
)
torch.manual_seed(42 + self.rank)
inp = torch.rand(mlp_dim, mlp_dim, device="cuda")
for _ in range(5):
optim.zero_grad()
loss = model(inp).sum()
loss.backward()
optim.step()
if not mutate_after_state_dict:
self.assertTrue(
sd["weight"]._local_tensor.untyped_storage().data_ptr()
== model.weight._local_tensor.untyped_storage().data_ptr()
)
@skip_if_lt_x_gpu(2)
def test_dp_state_dict_cpu_offload(self):
self.run_subtests(

View File

@ -1,47 +1,22 @@
# Owner(s): ["oncall: distributed"]
import contextlib
import copy
import functools
import itertools
import unittest
from collections.abc import Iterable
from typing import Union
import torch
import torch.distributed as dist
import torch.nn as nn
from torch.distributed._composable.replicate_with_fsdp import replicate
from torch.distributed.fsdp import CPUOffloadPolicy, FSDPModule, OffloadPolicy
from torch.distributed.tensor import DTensor, init_device_mesh
from torch.distributed.fsdp import FSDPModule
from torch.distributed.tensor import DTensor
from torch.testing._internal.common_distributed import skip_if_lt_x_gpu
from torch.testing._internal.common_fsdp import (
check_sharded_parity,
compiled_fsdp_test,
FSDPTest,
FSDPTestMultiThread,
MLP,
patch_all_gather,
patch_reduce_scatter,
)
from torch.testing._internal.common_utils import (
get_cycles_per_ms,
run_tests,
TEST_HPU,
wrapSwapTensorsTest,
)
from torch.testing._internal.distributed._tensor.common_dtensor import (
ModelArgs,
Transformer,
TransformerBlock,
)
from torch.testing._internal.common_fsdp import FSDPTestMultiThread, get_devtype, MLP
from torch.testing._internal.common_utils import run_tests
c10d_ops = torch.ops.c10d
funcol = torch.ops.c10d_functional
from torch.testing._internal.common_fsdp import get_devtype
device_type = torch.device(get_devtype())
@ -194,463 +169,5 @@ class TestReplicateRegisteredParams(FSDPTestMultiThread):
self.assertEqual(param, ref_param)
class TestReplicateCastAfterInit(FSDPTestMultiThread):
@property
def world_size(self) -> int:
return 2
@skip_if_lt_x_gpu(1)
@wrapSwapTensorsTest(True)
def test_to_float64_after_init(self):
"""Tests that the user can cast the module to float64 after init."""
# NOTE: Test fp64 instead of a lower precision dtype like bf16 for
# better numerics. The important part is changing the dtype.
torch.manual_seed(42)
mlp_dim, device, dtype = 4, device_type, torch.float64
model = MLP(mlp_dim, device=device)
for param in model.parameters():
dist.broadcast(param, src=0)
ref_model = copy.deepcopy(model).to(dtype)
ref_optim = torch.optim.Adam(ref_model.parameters(), lr=1e-2)
for module in (model.in_proj, model.out_proj, model):
replicate(module)
model.to(dtype)
for param in model.parameters():
self.assertEqual(param.dtype, dtype)
self.assertEqual(param.to_local().dtype, dtype)
self.assertEqual(param._spec.tensor_meta.dtype, dtype)
optim = torch.optim.Adam(model.parameters(), lr=1e-2, foreach=True)
check_sharded_parity(self, ref_model, model)
torch.manual_seed(42 + self.rank + 1)
inp = torch.randn((2, mlp_dim), device=device_type.type, dtype=dtype)
for iter_idx in range(10):
losses: list[torch.Tensor] = []
for _model in (ref_model, model):
losses.append(_model(inp).sum())
losses[-1].backward()
for param in ref_model.parameters():
if param.grad is not None:
dist.all_reduce(param.grad)
param.grad.div_(self.world_size)
self.assertEqual(losses[0], losses[1])
check_sharded_parity(self, ref_model, model)
for param in model.parameters():
self.assertEqual(param.dtype, dtype)
self.assertEqual(param.to_local().dtype, dtype)
self.assertEqual(param._spec.tensor_meta.dtype, dtype)
self.assertEqual(param.grad.dtype, dtype)
self.assertEqual(param.grad.to_local().dtype, dtype)
self.assertEqual(param.grad._spec.tensor_meta.dtype, dtype)
for _optim in (ref_optim, optim):
_optim.step()
_optim.zero_grad(set_to_none=(iter_idx % 2 == 0))
class TestReplicate1DTrainingCore(FSDPTest):
@property
def world_size(self) -> int:
return min(8, torch.get_device_module(device_type).device_count())
@skip_if_lt_x_gpu(2)
def test_train_parity_single_group(self):
"""
Tests train parity with DDP for a single FSDP group when sharding
parameters on dim-0.
"""
self.run_subtests(
{
"lin_shapes": [
[(16, 15), (15, 8)],
[(7, 15), (15, 3)],
[(16, 17), (17, 8)],
],
"use_shard_placement_fn": [False],
},
self._test_train_parity_single_group,
)
def _test_train_parity_single_group(
self, lin_shapes: list[tuple[int, int]], use_shard_placement_fn: bool
):
torch.manual_seed(42)
model = nn.Sequential(
nn.Linear(*lin_shapes[0]), nn.ReLU(), nn.Linear(*lin_shapes[1])
)
ref_model = copy.deepcopy(model).to(device_type)
ref_optim = torch.optim.Adam(ref_model.parameters(), lr=1e-2)
replicate(model)
optim = torch.optim.Adam(model.parameters(), lr=1e-2)
torch.manual_seed(42 + self.rank + 1)
inp = (torch.randn((4, lin_shapes[0][0]), device=device_type.type),)
for iter_idx in range(10):
losses: list[torch.Tensor] = []
for _model in (ref_model, model):
losses.append(_model(*inp).sum())
losses[-1].backward()
for param in ref_model.parameters():
if param.grad is not None:
dist.all_reduce(param.grad)
param.grad.div_(self.world_size)
for _optim in (ref_optim, optim):
_optim.zero_grad(set_to_none=(iter_idx % 2 == 0))
_optim.step()
self.assertEqual(losses[0], losses[1])
@skip_if_lt_x_gpu(2)
@unittest.skipIf(TEST_HPU, "Sleep kernel not supported for HPU")
@compiled_fsdp_test(compile_compute_on_module=Transformer)
def test_train_parity_multi_groups(self):
"""
Tests train parity against DDP when using multiple parameter groups for
communication (for communication and computation overlap plus memory
reduction).
"""
self.run_subtests(
{
"reshard_after_forward": [True, False],
"test_device_type": [device_type.type],
"offload_policy": [OffloadPolicy()],
"delay_after_forward": [False, True],
"delay_before_all_gather": [False, True],
"delay_before_reduce_scatter": [False, True],
"delay_before_optim": [False, True],
"unshard_async_op": [False],
},
self._test_train_parity_multi_group,
)
@skip_if_lt_x_gpu(2)
@unittest.skipIf(TEST_HPU, "sleep kernel not supported on HPU")
def test_train_parity_multi_group_cpu_offload_eager(self):
"""
Tests train parity when using multiple parameter groups for
communication and CPU offloading.
"""
self.run_subtests(
{
"reshard_after_forward": [True], # save CI time
"offload_policy": [
CPUOffloadPolicy(pin_memory=True),
CPUOffloadPolicy(pin_memory=False),
],
"test_device_type": [device_type.type],
"delay_after_forward": [False, True],
"delay_before_all_gather": [False, True],
"delay_before_reduce_scatter": [False, True],
"delay_before_optim": [False, True],
"unshard_async_op": [False],
},
self._test_train_parity_multi_group,
)
def _test_train_parity_multi_group(
self,
reshard_after_forward: Union[bool, int],
offload_policy: OffloadPolicy,
test_device_type: str,
delay_after_forward: bool,
delay_before_all_gather: bool,
delay_before_reduce_scatter: bool,
delay_before_optim: bool,
unshard_async_op: bool,
):
# Only test individual delays or all four delays to save test time
if (
delay_after_forward
+ delay_before_all_gather
+ delay_before_reduce_scatter
+ delay_before_optim
in (2, 3)
):
return
assert test_device_type in ("cuda", "hpu", "xpu", "cpu"), f"{test_device_type}"
torch.manual_seed(42)
vocab_size = 1024
model_args = ModelArgs(
n_layers=3,
n_heads=4,
vocab_size=vocab_size,
max_seq_len=64,
dropout_p=0,
)
model = Transformer(model_args)
ref_model = copy.deepcopy(model).to(device_type)
ref_optim = torch.optim.Adam(ref_model.parameters(), lr=1e-2)
mesh = init_device_mesh(
test_device_type,
(self.world_size, 1),
mesh_dim_names=("replicate", "shard"),
)
fully_shard_fn = functools.partial(
replicate,
device_mesh=mesh,
reshard_after_forward=reshard_after_forward,
offload_policy=offload_policy,
)
for module in model.modules():
if isinstance(module, TransformerBlock):
fully_shard_fn(module)
fully_shard_fn(model)
if unshard_async_op:
model._set_unshard_async_op(unshard_async_op)
optim = torch.optim.Adam(model.parameters(), lr=1e-2)
delay_in_ms = 100
orig_all_gather = dist.all_gather_into_tensor
orig_reduce_scatter = dist.reduce_scatter_tensor
def delayed_all_gather(*args, **kwargs):
torch.get_device_module(device_type)._sleep(
int(delay_in_ms * get_cycles_per_ms())
)
return orig_all_gather(*args, **kwargs)
def delayed_reduce_scatter(*args, **kwargs):
torch.get_device_module(device_type)._sleep(
int(delay_in_ms * get_cycles_per_ms())
)
return orig_reduce_scatter(*args, **kwargs)
torch.manual_seed(42 + self.rank + 1)
patch_all_gather_ctx = (
patch_all_gather(delayed_all_gather)
if delay_before_all_gather
else contextlib.nullcontext()
)
patch_reduce_scatter_ctx = (
patch_reduce_scatter(delayed_reduce_scatter)
if delay_before_reduce_scatter
else contextlib.nullcontext()
)
with patch_all_gather_ctx, patch_reduce_scatter_ctx:
for iter_idx in range(10):
inp = torch.randint(0, vocab_size, (3, 64), device=device_type)
losses: list[torch.Tensor] = []
for _model, _optim in ((ref_model, ref_optim), (model, optim)):
losses.append(_model(inp).sum())
if _model is model and delay_after_forward:
torch.get_device_module(device_type)._sleep(
int(delay_in_ms * get_cycles_per_ms())
)
losses[-1].backward()
if _model is model and delay_before_optim:
torch.get_device_module(device_type)._sleep(
int(delay_in_ms * get_cycles_per_ms())
)
for param in ref_model.parameters():
if param.grad is not None:
dist.all_reduce(param.grad)
param.grad.div_(self.world_size)
for _optim in (ref_optim, optim):
_optim.zero_grad(set_to_none=(iter_idx % 2 == 0))
_optim.step()
self.assertEqual(losses[0], losses[1])
@skip_if_lt_x_gpu(2)
def test_non_root_forward_backward(self):
"""
Tests running forward/backward through the root and then through a
non-root. The non-root needs to synchronize streams/queue the callback.
"""
torch.manual_seed(42)
lin_dim = 32
model = nn.Sequential(*[MLP(lin_dim, torch.device("cpu")) for _ in range(3)])
ref_model = copy.deepcopy(model).to(device_type)
ref_optim = torch.optim.Adam(ref_model.parameters(), lr=1e-2)
for mlp in model:
replicate(mlp)
replicate(model)
optim = torch.optim.Adam(model.parameters(), lr=1e-2, foreach=True)
torch.manual_seed(42 + self.rank)
inp = torch.randn((8, lin_dim), device=device_type)
ref_root_loss = ref_model(inp).sum()
ref_root_loss.backward()
for param in ref_model.parameters():
dist.all_reduce(param.grad)
param.grad.detach().div_(self.world_size)
ref_optim.step()
ref_optim.zero_grad()
ref_nonroot_loss = ref_model[0](inp).sum()
ref_nonroot_loss.backward()
for param in ref_model.parameters():
if param.grad is not None:
dist.all_reduce(param.grad)
param.grad.detach().div_(self.world_size)
ref_optim.step()
root_loss = model(inp).sum()
root_loss.backward()
torch.get_device_module(device_type)._sleep(int(100 * get_cycles_per_ms()))
optim.step()
optim.zero_grad()
nonroot_loss = model[0](inp).sum()
nonroot_loss.backward()
optim.step()
self.assertEqual(ref_root_loss, root_loss)
self.assertEqual(ref_nonroot_loss, nonroot_loss)
self.assertEqual(ref_model(inp).sum(), model(inp).sum())
@skip_if_lt_x_gpu(2)
def test_multi_forward_module(self):
"""
Tests parity when running a module that participates multiple
times in forward.
"""
self.run_subtests(
{"reshard_after_forward": [True, False]},
self._test_multi_forward_module,
)
def _test_multi_forward_module(self, reshard_after_forward: Union[bool, int]):
class MultiForwardModule(nn.Module):
def __init__(self, device: torch.device):
super().__init__()
self.inner = nn.Linear(4, 4, device=device)
self.outer = nn.Linear(4, 5, device=device)
def forward(self, x):
i = self.inner(x)
j = self.inner(x)
return self.outer(i + j)
torch.manual_seed(42)
model = MultiForwardModule(device=device_type.type)
ref_model = copy.deepcopy(model).to(device_type)
ref_optim = torch.optim.Adam(ref_model.parameters(), lr=1e-2)
replicate(model.inner)
replicate(model)
optim = torch.optim.Adam(model.parameters(), lr=1e-2)
torch.manual_seed(42 + self.rank)
inp = torch.randn((32, 4), device=device_type.type)
for iter_idx in range(10):
losses: list[torch.Tensor] = []
for _model in (ref_model, model):
losses.append(_model(inp).sum())
losses[-1].backward()
for param in ref_model.parameters():
if param.grad is not None:
dist.all_reduce(param.grad)
param.grad.div_(self.world_size)
for _optim in (ref_optim, optim):
_optim.zero_grad(set_to_none=(iter_idx % 2 == 0))
_optim.step()
self.assertEqual(losses[0], losses[1])
@skip_if_lt_x_gpu(2)
def test_explicit_prefetching(self):
torch.manual_seed(42)
model_args = ModelArgs(n_layers=8, dropout_p=0.0)
model = Transformer(model_args)
ref_model = copy.deepcopy(model).to(device_type)
ref_optim = torch.optim.AdamW(ref_model.parameters(), lr=1e-2)
for layer in itertools.chain(model.layers, [model]):
replicate(layer)
optim = torch.optim.AdamW(model.parameters(), lr=1e-2)
num_to_forward_prefetch = num_to_backward_prefetch = 2
for i, layer in enumerate(model.layers):
if i >= len(model.layers) - num_to_forward_prefetch:
break
layers_to_prefetch = [
model.layers[i + j] for j in range(1, num_to_forward_prefetch + 1)
]
layer.set_modules_to_forward_prefetch(layers_to_prefetch)
for i, layer in enumerate(model.layers):
if i < num_to_backward_prefetch:
continue
layers_to_prefetch = [
model.layers[i - j] for j in range(1, num_to_backward_prefetch + 1)
]
layer.set_modules_to_backward_prefetch(layers_to_prefetch)
torch.manual_seed(42 + self.rank)
inp = torch.randint(0, model_args.vocab_size, (2, 8), device=device_type.type)
for _ in range(10):
losses: list[torch.Tensor] = []
for _model in (ref_model, model):
losses.append(_model(inp).sum())
losses[-1].backward()
for param in ref_model.parameters():
if param.grad is not None:
dist.all_reduce(param.grad)
param.grad.div_(self.world_size)
for _optim in (ref_optim, optim):
_optim.zero_grad()
_optim.step()
self.assertEqual(losses[0], losses[1])
@skip_if_lt_x_gpu(2)
@unittest.skipIf(TEST_HPU, "Sleep is not supported on HPU")
def test_post_optim_event(self):
torch.manual_seed(42)
model_args = ModelArgs(dropout_p=0.0)
model = Transformer(model_args)
ref_model = copy.deepcopy(model).to(device_type.type)
ref_optim = torch.optim.AdamW(ref_model.parameters(), lr=1e-2)
for layer in itertools.chain(model.layers, [model]):
replicate(layer)
optim = torch.optim.AdamW(model.parameters(), lr=1e-2)
def step_post_hook(
fsdp_module: FSDPModule, opt: torch.optim.Optimizer, args, kwargs
) -> None:
post_optim_event = (
torch.get_device_module(device_type).current_stream().record_event()
)
fsdp_module.set_post_optim_event(post_optim_event)
optim.register_step_post_hook(functools.partial(step_post_hook, model))
torch.manual_seed(42 + self.rank)
inp = torch.randint(0, model_args.vocab_size, (2, 8), device=device_type.type)
# Track all losses and check for equality at the end to avoid a CPU
# sync point after each iteration
ref_losses: list[torch.Tensor] = []
losses: list[torch.Tensor] = []
for _ in range(10):
ref_optim.zero_grad()
ref_losses.append(ref_model(inp).sum())
ref_losses[-1].backward()
for param in ref_model.parameters():
if param.grad is not None:
dist.all_reduce(param.grad)
param.grad.div_(self.world_size)
ref_optim.step()
for _ in range(10):
optim.zero_grad()
losses.append(model(inp).sum())
losses[-1].backward()
optim.step()
# Sleep after the optimizer step to allow CPU to run ahead into the
# next iteration's forward, exercising the post-optim stream sync
torch.get_device_module(device_type)._sleep(int(25 * get_cycles_per_ms()))
for ref_loss, loss in zip(ref_losses, losses):
self.assertEqual(ref_loss, loss)
if __name__ == "__main__":
run_tests()

View File

@ -41,7 +41,7 @@ class TestDTensorDebugMode(TestCase):
x_dtensor = DTensor.from_local(x, mesh, [Shard(0)], run_check=False)
y_dtensor = DTensor.from_local(y, mesh, [Shard(0)], run_check=False)
with DebugMode(record_torchfunction=True) as debug_mode:
with DebugMode() as debug_mode:
torch.mm(x_dtensor, y_dtensor).sum()
self.assertExpectedInline(
@ -80,7 +80,7 @@ class TestDTensorDebugMode(TestCase):
x_dtensor = DTensor.from_local(x, mesh, [Shard(0)], run_check=False)
y_dtensor = DTensor.from_local(y, mesh, [Shard(1)], run_check=False)
with DebugMode(record_torchfunction=True) as debug_mode:
with DebugMode() as debug_mode:
z = x_dtensor + y_dtensor
z.sum().backward()
@ -121,7 +121,7 @@ class TestDTensorDebugMode(TestCase):
b_dt = DTensor.from_local(b, mesh, [Replicate(), Partial()], run_check=False)
# Capture the operator decomposition
with DebugMode(record_torchfunction=True) as debug_mode:
with DebugMode() as debug_mode:
torch.einsum("bld,dnh->blnh", a_dt, b_dt)
self.assertExpectedInline(
@ -176,7 +176,7 @@ class TestDTensorDebugMode(TestCase):
x = torch.randn(8, 8, 8)
linear = torch.nn.Linear(8, 8)
with DebugMode(record_torchfunction=True) as debug_mode:
with DebugMode() as debug_mode:
linear(x).sum()
self.assertExpectedInline(
@ -196,7 +196,7 @@ class TestDTensorDebugMode(TestCase):
x = torch.randn(8, 8)
y = torch.randn(8, 8, 8)
with DebugMode(record_torchfunction=True, record_faketensor=True) as debug_mode:
with DebugMode(record_faketensor=True) as debug_mode:
torch.matmul(y, x)
self.assertExpectedInline(

View File

@ -9,11 +9,12 @@ from typing import Union
import torch
import torch.distributed as dist
import torch.nn.functional as F
from torch import Tensor
from torch import nn, Tensor
from torch.distributed.device_mesh import init_device_mesh
from torch.distributed.tensor import DeviceMesh
from torch.distributed.tensor.debug import CommDebugMode
from torch.distributed.tensor.experimental._attention import (
_AttentionContextParallel,
_CausalBehavior,
_cp_options,
_DispatchMode,
@ -23,6 +24,7 @@ from torch.distributed.tensor.experimental._attention import (
context_parallel_unshard,
set_rotate_method,
)
from torch.distributed.tensor.parallel import parallelize_module
from torch.nn.attention import sdpa_kernel, SDPBackend
from torch.nn.attention.flex_attention import (
_mask_mod_signature,
@ -40,6 +42,8 @@ from torch.testing._internal.common_distributed import skip_if_lt_x_gpu
from torch.testing._internal.common_utils import run_tests, skipIfRocm
from torch.testing._internal.distributed._tensor.common_dtensor import (
DTensorTestBase,
ModelArgs,
Transformer,
with_comms,
)
@ -269,6 +273,180 @@ class RingAttentionTest(DTensorTestBase):
behavior,
)
@skip_if_lt_x_gpu(2)
@unittest.skipIf(
not PLATFORM_SUPPORTS_FLASH_ATTENTION, "Does not support flash attention"
)
@with_comms
def test_ring_attention_native_transformer(self) -> None:
self.run_subtests(
{
"is_causal": [True, False],
"rotater": [_RotateMethod.ALL_GATHER, _RotateMethod.ALL_TO_ALL],
},
self._test_ring_attention_native_transformer,
)
@sdpa_kernel(backends=[SDPBackend.FLASH_ATTENTION])
def _test_ring_attention_native_transformer(
self, is_causal: bool, rotater: _RotateMethod
) -> None:
_cp_options.enable_load_balance = is_causal
set_rotate_method(rotater_enum_to_str[rotater])
self.assertEqual(_cp_options.rotate_method, rotater)
device_mesh = DeviceMesh(
self.device_type,
torch.arange(0, self.world_size),
)
dtype = torch.bfloat16
bs = 8
ntokens = 8
dim = 32
nheads = 8
num_layers = 2
encoder_layer = nn.TransformerEncoderLayer(
d_model=dim,
nhead=nheads,
dim_feedforward=dim,
batch_first=True,
).to(dtype)
encoder_layer = parallelize_module(
module=encoder_layer,
device_mesh=device_mesh,
parallelize_plan={
"self_attn": _AttentionContextParallel(),
},
)
model = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
model = model.to(self.device_type).to(dtype)
mask = (
nn.Transformer.generate_square_subsequent_mask(
ntokens, device=self.device_type, dtype=dtype
)
if is_causal
else None
)
seq = torch.rand((bs, ntokens, dim), device=self.device_type, dtype=dtype)
with CommDebugMode() as comm_mode:
out = model(seq, mask=mask, is_causal=is_causal)
if rotater == _RotateMethod.ALL_TO_ALL:
self.assertDictEqual(
comm_mode.get_comm_counts(),
{
c10d_functional.all_to_all_single: (self.world_size - 1)
* num_layers,
},
)
else:
self.assertDictEqual(
comm_mode.get_comm_counts(),
{
c10d_functional.all_gather_into_tensor: num_layers,
},
)
with CommDebugMode() as comm_mode:
out.sum().backward()
if rotater == _RotateMethod.ALL_TO_ALL:
self.assertDictEqual(
comm_mode.get_comm_counts(),
{
c10d_functional.all_to_all_single: (self.world_size * 2 - 1)
* num_layers,
},
)
else:
self.assertDictEqual(
comm_mode.get_comm_counts(),
{
c10d_functional.all_gather_into_tensor: num_layers,
c10d_functional.all_to_all_single: self.world_size * num_layers,
},
)
@skip_if_lt_x_gpu(2)
@unittest.skipIf(
not PLATFORM_SUPPORTS_FLASH_ATTENTION, "Does not support flash attention"
)
@with_comms
@sdpa_kernel(backends=[SDPBackend.FLASH_ATTENTION])
def test_ring_attention_custom_transformer(self) -> None:
self.run_subtests(
{"rotater": [_RotateMethod.ALL_GATHER, _RotateMethod.ALL_TO_ALL]},
self._test_ring_attention_custom_transformer,
)
def _test_ring_attention_custom_transformer(self, rotater: _RotateMethod) -> None:
set_rotate_method(rotater_enum_to_str[rotater])
self.assertEqual(_cp_options.rotate_method, rotater)
device_mesh = DeviceMesh(
self.device_type,
torch.arange(0, self.world_size),
)
# early init DTensor RNG tracker to avoid the broadcast being captured in comm_mode
torch.distributed.tensor._random.manual_seed(10, device_mesh)
dtype = torch.bfloat16
bs = 2
args = ModelArgs()
model = Transformer(args).to(dtype).to(self.device_type)
model = parallelize_module(
module=model,
device_mesh=device_mesh,
parallelize_plan={
f"layers.{i}.attention": _AttentionContextParallel()
for i in range(args.n_layers)
},
)
seq = torch.randint(
args.vocab_size, (bs, args.max_seq_len), device=self.device_type
)
with CommDebugMode() as comm_mode:
out = model(seq)
if rotater == _RotateMethod.ALL_TO_ALL:
self.assertDictEqual(
comm_mode.get_comm_counts(),
{
c10d_functional.all_to_all_single: (self.world_size - 1)
* args.n_layers,
},
)
else:
self.assertDictEqual(
comm_mode.get_comm_counts(),
{c10d_functional.all_gather_into_tensor: args.n_layers},
)
with CommDebugMode() as comm_mode:
out.sum().backward()
if rotater == _RotateMethod.ALL_TO_ALL:
self.assertDictEqual(
comm_mode.get_comm_counts(),
{
c10d_functional.all_to_all_single: (self.world_size * 2 - 1)
* args.n_layers,
},
)
else:
self.assertDictEqual(
comm_mode.get_comm_counts(),
{
c10d_functional.all_gather_into_tensor: args.n_layers,
c10d_functional.all_to_all_single: self.world_size * args.n_layers,
},
)
# Compile the flex_attention function
compiled_flex_attention = torch.compile(flex_attention, dynamic=False, fullgraph=True)
@ -354,12 +532,12 @@ def generate_doc_mask_mod(
return doc_mask_mod
class CPFlexAttentionTest(DTensorTestBase):
class RingFlexAttentionTest(DTensorTestBase):
@property
def world_size(self) -> int:
return 2
def _test_cp_flex_attention(
def _test_ring_flex_attention(
self, qkv_size, B=1, mask_func=causal_mask, atol=1e-6, rtol=1e-2
) -> None:
torch.cuda.manual_seed(10)
@ -408,6 +586,15 @@ class CPFlexAttentionTest(DTensorTestBase):
mesh_shape=(self.world_size,),
mesh_dim_names=("cp",),
)
# NOTE: cp needs to know the sharding dimension
# TODO: see if this can be moved to the cp context
from torch.distributed.tensor.experimental._attention import _set_cp_global_var
_set_cp_global_var("cp_shard_dim", 2)
self.assertEqual(
torch.distributed.tensor.experimental._attention._cp_global_vars.cp_shard_dim,
2,
)
# NOTE: we do not test load balance here
_cp_options.enable_load_balance = False
@ -497,17 +684,17 @@ class CPFlexAttentionTest(DTensorTestBase):
@unittest.skipIf(
not PLATFORM_SUPPORTS_FLASH_ATTENTION, "Does not support flash attention"
)
def test_cp_flex_attention(self) -> None:
def test_ring_flex_attention(self) -> None:
self.run_subtests(
{"qkv_size": [128 * self.world_size, 2048]},
self._test_cp_flex_attention,
self._test_ring_flex_attention,
)
# NOTE: Context Parallel should not be used for small attentions (block_size < 128)
with self.assertRaisesRegex(AssertionError, "Tensor-likes are not close"):
self.run_subtests(
{"qkv_size": [64 * self.world_size]},
self._test_cp_flex_attention,
self._test_ring_flex_attention,
)
# TODO: merge with the above test
@ -516,7 +703,7 @@ class CPFlexAttentionTest(DTensorTestBase):
@unittest.skipIf(
not PLATFORM_SUPPORTS_FLASH_ATTENTION, "Does not support flash attention"
)
def test_cp_flex_attention_document_mask(self) -> None:
def test_ring_flex_attention_document_mask(self) -> None:
random.seed(10)
# NOTE: Each (batch_size, seq_len) tuple introduces 2 create_block_mask
@ -537,7 +724,7 @@ class CPFlexAttentionTest(DTensorTestBase):
# TODO: change this for-loop to run_subtests
# Use a for-loop instead of run_subtests because we need to initialize the mask
# for each subtest. This can be baked into self._test_cp_flex_attention as
# for each subtest. This can be baked into self._test_ring_flex_attention as
# a str argument denoting mask type.
for batch_size, max_seq_len in itertools.product(
batch_size_list, max_seq_len_list
@ -551,7 +738,7 @@ class CPFlexAttentionTest(DTensorTestBase):
# construct testing function
test_func = functools.partial(
self._test_cp_flex_attention,
self._test_ring_flex_attention,
qkv_size=max_seq_len,
B=batch_size,
mask_func=document_causal_mask,

View File

@ -7,7 +7,7 @@ import warnings
import torch
import torch.distributed as dist
import torch.testing._internal.common_methods_invocations as common_ops
from torch.distributed.tensor import distribute_tensor, DTensor, init_device_mesh, Shard
from torch.distributed.tensor import DTensor, init_device_mesh
from torch.overrides import resolve_name
from torch.testing._internal.common_device_type import (
instantiate_device_type_tests,
@ -21,7 +21,6 @@ from torch.testing._internal.distributed._tensor.common_dtensor import (
)
from torch.utils import _pytree as pytree
from torch.utils._pytree import tree_map
from torch.utils.debug_mode import DebugMode
# rewrite common size variables to sth can be sharded evenly
@ -118,6 +117,7 @@ dtensor_fails = {
xfail("cholesky"),
xfail("cholesky_inverse"),
xfail("cholesky_solve"),
xfail("chunk"),
xfail("combinations"),
xfail("complex"),
xfail("count_nonzero"),
@ -661,36 +661,6 @@ class TestDTensorOps(DTensorOpTestBase):
sample_inputs_filter=lambda s: s.kwargs["num_classes"] != -1,
)
def test_mean(self):
self.mesh = init_device_mesh(DEVICE_TYPE, (self.world_size,))
shape = [2 * self.world_size + 1, 2 * self.world_size]
tensor = (
torch.arange(shape[0] * shape[1], dtype=torch.float32)
.reshape(shape)
.to(DEVICE_TYPE)
)
for is_evenly_shardable in [True]:
if is_evenly_shardable:
placement = [Shard(1)]
reduce_dim = 1
else:
placement = [Shard(0)]
reduce_dim = 0
dtensor = distribute_tensor(tensor, self.mesh, placement)
with DebugMode(record_torchfunction=False) as debug_mode:
mean = dtensor.mean(dim=reduce_dim)
full_tensor = mean.full_tensor()
self.assertEqual(full_tensor, tensor.mean(dim=reduce_dim))
if is_evenly_shardable:
self.assertFalse("redistribute_input" in debug_mode.debug_string())
else:
self.assertTrue("redistribute_input" in debug_mode.debug_string())
# only instantiate tests for DEVICE_TYPE alone (i.e. either CPU or GPU)
instantiate_device_type_tests(TestDTensorOps, globals(), only_for=(DEVICE_TYPE,))

View File

@ -1,10 +1,8 @@
# Copyright (c) Meta Platforms, Inc. and affiliates
# Owner(s): ["oncall: distributed"]
import random
from torch.distributed.tensor._dtensor_spec import DTensorSpec
from torch.distributed.tensor._op_schema import OpSchema, RuntimeSchemaInfo
from torch.distributed.tensor._op_schema import OpSchema
from torch.testing._internal.common_utils import run_tests, TestCase
@ -12,108 +10,12 @@ class TestOpSchema(TestCase):
def test_equality_checks_lists_of_dtensor_spec(self):
"""If x == y, then we must have h(x) == h(y)."""
dts = DTensorSpec(mesh=None, placements=tuple(), tensor_meta=None)
schema1 = OpSchema(op=None, args_schema=(dts, [dts]), kwargs_schema={})
schema2 = OpSchema(op=None, args_schema=(dts, [dts, dts]), kwargs_schema={})
schema1 = OpSchema(op=None, args_schema=[dts, [dts]], kwargs_schema={})
schema2 = OpSchema(op=None, args_schema=[dts, [dts, dts]], kwargs_schema={})
# This is a regression test; these schemas used to compare equal.
self.assertNotEqual(schema1, schema2)
self.assertNotEqual(hash(schema1), hash(schema2))
def test_equality_respects_static_attributes(self):
def _get_sample_op_schemas(static_arg_val, static_kwarg_val):
dts = DTensorSpec(mesh=None, placements=tuple(), tensor_meta=None)
static_argnum = 2
static_kwargkey = ["statickwarg"]
annotated_schemas = [
(False, False, None),
(True, False, RuntimeSchemaInfo(static_argnum=static_argnum)),
(False, True, RuntimeSchemaInfo(static_kwargkey=static_kwargkey)),
(
True,
True,
RuntimeSchemaInfo(
static_argnum=static_argnum, static_kwargkey=static_kwargkey
),
),
]
# non-tensor args show up in hash iff the argnum is static;
# kwargs show up in hash iff their name is in static_kwargkey.
# random elements are random because they are not supposed to matter for
# equality at all.
args_schema = (dts, random.randint(1, 1000000), static_arg_val)
kwargs_schema = {
"ignoredkwarg": random.randint(1, 1000000),
"statickwarg": static_kwarg_val,
}
return [
(
has_static_arg,
has_static_kwarg,
OpSchema(
op=None,
args_schema=args_schema,
kwargs_schema=kwargs_schema,
schema_info=si,
),
)
for (has_static_arg, has_static_kwarg, si) in annotated_schemas
]
for lhs_has_static_arg, lhs_has_static_kwarg, lhs in _get_sample_op_schemas(
1, 2
):
# Static arg/kwarg both match
for rhs_has_static_arg, rhs_has_static_kwarg, rhs in _get_sample_op_schemas(
1, 2
):
if (
lhs_has_static_arg == rhs_has_static_arg
and lhs_has_static_kwarg == rhs_has_static_kwarg
):
self.assertEqual(lhs, rhs)
else:
self.assertNotEqual(lhs, rhs)
# Static arg mismatch
for rhs_has_static_arg, rhs_has_static_kwarg, rhs in _get_sample_op_schemas(
3, 2
):
if (
lhs_has_static_arg
or rhs_has_static_arg
or lhs_has_static_kwarg != rhs_has_static_kwarg
):
self.assertNotEqual(lhs, rhs)
else:
self.assertEqual(lhs, rhs)
# Static kwarg mismatch
for rhs_has_static_arg, rhs_has_static_kwarg, rhs in _get_sample_op_schemas(
1, 3
):
if (
lhs_has_static_kwarg
or rhs_has_static_kwarg
or lhs_has_static_arg != rhs_has_static_arg
):
self.assertNotEqual(lhs, rhs)
else:
self.assertEqual(lhs, rhs)
# Static arg/kwarg both mismatch
for rhs_has_static_arg, rhs_has_static_kwarg, rhs in _get_sample_op_schemas(
3, 4
):
if (
lhs_has_static_arg
or rhs_has_static_arg
or lhs_has_static_kwarg
or rhs_has_static_kwarg
):
self.assertNotEqual(lhs, rhs)
else:
self.assertEqual(lhs, rhs)
if __name__ == "__main__":
run_tests()

View File

@ -63,20 +63,22 @@ def nvshmem_get_kernel(
@triton.jit
def nvshmem_putmem_signal_block_kernel(
dst,
src,
dst_ptr,
src_ptr,
size_bytes,
signal,
sig_val,
sig_ptr,
signal_val,
sig_op,
peer,
):
nvshmem.putmem_signal_block(dst, src, size_bytes, signal, sig_val, sig_op, peer)
nvshmem.putmem_signal_block(
dst_ptr, src_ptr, size_bytes, sig_ptr, signal_val, sig_op, peer
)
@triton.jit
def nvshmem_signal_wait_until_kernel(signal, cmp_op, cmp_val):
nvshmem.signal_wait_until(signal, cmp_op, cmp_val)
def nvshmem_signal_wait_until_kernel(sig_ptr, cmp_op, cmp_val):
nvshmem.signal_wait_until(sig_ptr, cmp_op, cmp_val)
@triton.jit
@ -417,7 +419,7 @@ class NVSHMEMTritonTest(MultiProcContinuousTest):
val = 11
inp = symm_mem.empty(numel, dtype=dtype, device=self.device).fill_(val)
out = symm_mem.empty(numel, dtype=dtype, device=self.device).fill_(-1)
symm_mem.rendezvous(inp, group=group_name)
inp_hdl = symm_mem.rendezvous(inp, group=group_name)
out_hdl = symm_mem.rendezvous(out, group=group_name)
# Use the signal pad attached to the output symmetric memory handle
@ -431,12 +433,15 @@ class NVSHMEMTritonTest(MultiProcContinuousTest):
if rank == 0:
# Rank 0 puts into Rank 1
dst_ptr = out_hdl.buffer_ptrs[peer]
src_ptr = inp_hdl.buffer_ptrs[rank]
sig_ptr = out_hdl.signal_pad_ptrs[peer]
nvshmem_putmem_signal_block_kernel[(1, 1, 1)](
out,
inp,
dst_ptr,
src_ptr,
size_bytes=msg_size_bytes,
signal=flag,
sig_val=SIGNAL_VAL,
sig_ptr=sig_ptr,
signal_val=SIGNAL_VAL,
sig_op=NVSHMEM_SIGNAL_SET,
peer=peer,
extern_libs=nvshmem_lib,
@ -444,8 +449,9 @@ class NVSHMEMTritonTest(MultiProcContinuousTest):
if rank == 1:
# Wait until signal flag is set by Rank 0
sig_ptr_local = out_hdl.signal_pad_ptrs[rank]
nvshmem_signal_wait_until_kernel[(1,)](
flag,
sig_ptr_local,
cmp_op=NVSHMEM_CMP_EQ,
cmp_val=SIGNAL_VAL,
extern_libs=nvshmem_lib,
@ -479,7 +485,7 @@ class NVSHMEMTritonTest(MultiProcContinuousTest):
val = 11
inp = symm_mem.empty(numel, dtype=dtype, device=self.device).fill_(val)
out = symm_mem.empty(numel, dtype=dtype, device=self.device).fill_(-1)
symm_mem.rendezvous(inp, group=group_name)
inp_hdl = symm_mem.rendezvous(inp, group=group_name)
out_hdl = symm_mem.rendezvous(out, group=group_name)
# Use the signal pad attached to the output symmetric memory handle
@ -493,20 +499,24 @@ class NVSHMEMTritonTest(MultiProcContinuousTest):
if rank == 0:
# Rank 0 puts into Rank 1
dst_ptr = out_hdl.buffer_ptrs[peer]
src_ptr = inp_hdl.buffer_ptrs[rank]
sig_ptr = out_hdl.signal_pad_ptrs[peer]
nvshmem_putmem_signal_block_kernel[(1, 1, 1)](
out,
inp,
dst_ptr,
src_ptr,
size_bytes=msg_size_bytes,
signal=flag,
sig_val=SIGNAL_VAL,
sig_ptr=sig_ptr,
signal_val=SIGNAL_VAL,
sig_op=NVSHMEM_SIGNAL_ADD,
peer=peer,
extern_libs=nvshmem_lib,
)
if rank == 1:
sig_ptr_local = out_hdl.signal_pad_ptrs[rank]
nvshmem_signal_wait_until_kernel[(1, 1, 1)](
flag,
sig_ptr_local,
cmp_op=NVSHMEM_CMP_EQ,
cmp_val=SIGNAL_VAL,
extern_libs=nvshmem_lib,
@ -536,13 +546,10 @@ class NVSHMEMTritonTest(MultiProcContinuousTest):
FLAG_FINAL_VALUE = 42
# Use a single int64 symmetric tensor as our synchronization flag.
flag = symm_mem.empty(1, dtype=torch.int32, device=self.device).fill_(
flag = symm_mem.empty(1, dtype=torch.int64, device=self.device).fill_(
FLAG_INITIAL_VALUE
)
symm_mem.rendezvous(flag, group=group_name)
expected_flag = torch.tensor(
[FLAG_FINAL_VALUE], dtype=torch.int32, device=self.device
)
nvshmem_barrier_all_kernel[(1,)](extern_libs=nvshmem_lib)
@ -558,15 +565,19 @@ class NVSHMEMTritonTest(MultiProcContinuousTest):
# Verification
torch.testing.assert_close(
flag,
expected_flag,
torch.tensor([FLAG_FINAL_VALUE], dtype=torch.int64, device=self.device),
)
if rank == 1:
# Rank 1 (the signaler)
val_to_put = torch.tensor(
[FLAG_FINAL_VALUE], dtype=torch.int64, device=self.device
)
# Launch a kernel to put the value to Rank 0's flag tensor.
nvshmem_put_kernel[(1,)](
flag, # Destination symmetric tensor on the remote PE
expected_flag, # Source data tensor (local)
val_to_put, # Source data tensor (local)
1, # Number of elements
peer, # The target PE (Rank 0)
extern_libs=nvshmem_lib,
@ -598,7 +609,7 @@ class NVSHMEMTritonTest(MultiProcContinuousTest):
# Producer (rank 0) prepares the data to send
inp = symm_mem.empty(numel, dtype=dtype, device=self.device).fill_(val_to_put)
symm_mem.rendezvous(inp, group=group_name)
inp_hdl = symm_mem.rendezvous(inp, group=group_name)
# Consumer (rank 1) prepares the destination buffer
out = symm_mem.empty(numel, dtype=dtype, device=self.device).fill_(-1)
out_hdl = symm_mem.rendezvous(out, group=group_name)
@ -608,20 +619,24 @@ class NVSHMEMTritonTest(MultiProcContinuousTest):
if rank == 0:
# Producer (rank 0): Puts data into rank 1's `out` buffer and then sets the flag
dst_ptr = out_hdl.buffer_ptrs[peer]
src_ptr = inp_hdl.buffer_ptrs[rank]
sig_ptr = out_hdl.signal_pad_ptrs[peer]
nvshmem_putmem_signal_block_kernel[(1, 1, 1)](
out,
inp,
dst_ptr,
src_ptr,
size_bytes=msg_size_bytes,
signal=flag,
sig_val=COMPLETION_FLAG_VAL,
sig_ptr=sig_ptr,
signal_val=COMPLETION_FLAG_VAL,
sig_op=NVSHMEM_SIGNAL_SET,
peer=peer,
extern_libs=nvshmem_lib,
)
elif rank == 1:
# Consumer (rank 1): Waits on the signal variable using `signal_wait_until`.
sig_ptr = out_hdl.signal_pad_ptrs[rank]
nvshmem_signal_wait_until_kernel[(1, 1, 1)](
flag,
sig_ptr,
cmp_op=NVSHMEM_CMP_EQ,
cmp_val=COMPLETION_FLAG_VAL,
extern_libs=nvshmem_lib,
@ -674,10 +689,10 @@ class NVSHMEMTritonTest(MultiProcContinuousTest):
symm_mem.rendezvous(out2, group=group_name)
# Use regular symmetric memory tensor for flag
flag = symm_mem.empty(1, dtype=torch.int32, device=self.device).fill_(0)
flag = symm_mem.empty(1, dtype=torch.int64, device=self.device).fill_(0)
symm_mem.rendezvous(flag, group=group_name)
flag_update_val = torch.tensor(
[flag_val], dtype=torch.int32, device=self.device
[flag_val], dtype=torch.int64, device=self.device
)
NVSHMEM_CMP_EQ = 0 # compare equal
@ -710,7 +725,7 @@ class NVSHMEMTritonTest(MultiProcContinuousTest):
out2, val2 * torch.ones(numel, dtype=dtype, device=self.device)
)
torch.testing.assert_close(
flag, torch.tensor([flag_val], dtype=torch.int32, device=self.device)
flag, torch.tensor([flag_val], dtype=torch.int64, device=self.device)
)
@skipIfRocm
@ -732,9 +747,9 @@ class NVSHMEMTritonTest(MultiProcContinuousTest):
inp = symm_mem.empty(numel, dtype=dtype, device=self.device).fill_(val)
out = symm_mem.empty(numel, dtype=dtype, device=self.device).fill_(-1)
flag = symm_mem.empty(1, dtype=torch.int32, device=self.device).fill_(0)
flag = symm_mem.empty(1, dtype=torch.int64, device=self.device).fill_(0)
flag_update_val = torch.tensor(
[flag_val], dtype=torch.int32, device=self.device
[flag_val], dtype=torch.int64, device=self.device
)
symm_mem.rendezvous(inp, group=group_name)
@ -967,7 +982,7 @@ class NVSHMEMTritonTest(MultiProcContinuousTest):
torch.uint8,
torch.float16,
torch.float32,
# torch.float64, # Tensor-likes are not close
torch.float64,
torch.bfloat16,
],
)
@ -1120,7 +1135,7 @@ class NVSHMEMTritonTest(MultiProcContinuousTest):
torch.int64,
torch.float16,
torch.float32,
# torch.float64, # Tensor-likes are not close
torch.float64,
torch.bfloat16,
],
)

View File

@ -386,7 +386,7 @@ class TestCustomBackendAPI(torch._dynamo.test_case.TestCase):
self.assertTrue(backend_run)
devices = ["cpu", "cuda", "hpu", "xpu"]
devices = ["cpu", "cuda", "hpu"]
instantiate_device_type_tests(TestOptimizations, globals(), only_for=devices)
if __name__ == "__main__":

View File

@ -8,12 +8,7 @@ from torch._dynamo.callback import callback_handler, CallbackArgs, CallbackTrigg
from torch._dynamo.test_case import run_tests, TestCase
from torch._guards import CompileId
from torch.testing._internal.common_utils import TEST_WITH_ROCM
from torch.testing._internal.triton_utils import HAS_CUDA_AND_TRITON, requires_gpu
device_type = (
acc.type if (acc := torch.accelerator.current_accelerator(True)) else "cpu"
)
from torch.testing._internal.triton_utils import requires_cuda_and_triton
class CallbackTests(TestCase):
@ -66,7 +61,7 @@ class CallbackTests(TestCase):
@unittest.skipIf(
TEST_WITH_ROCM, "ROCm outputs a different number of autotuning logs"
)
@requires_gpu
@requires_cuda_and_triton
@torch._inductor.config.patch(force_disable_caches=True)
def test_triggers(self) -> None:
torch._dynamo.reset()
@ -96,9 +91,9 @@ class CallbackTests(TestCase):
torch._dynamo.graph_break()
return self.fc2(temp)
model = TinyModel().to(device_type)
model = TinyModel().to("cuda")
compiled_model = torch.compile(model, mode="max-autotune")
x = torch.randn(10, 10, device=device_type)
x = torch.randn(10, 10, device="cuda")
loss = compiled_model(x).sum()
loss.backward()
@ -116,13 +111,9 @@ end=CallbackArgs(callback_trigger=<CallbackTrigger.LAZY_BACKWARD: 2>, compile_id
)
order.clear()
if not HAS_CUDA_AND_TRITON:
return
compiled_model.zero_grad()
loss = compiled_model(x).sum()
loss.backward()
self.assertExpectedInline(
"\n".join(order),
"""\

View File

@ -48,6 +48,27 @@ class GenericCtxMgr:
class ErrorMessagesTest(LoggingTestCase):
def test_dynamic_shape_operator(self):
def fn():
return torch.nonzero(torch.rand([10, 10]))
self.assertExpectedInlineMunged(
Unsupported,
lambda: torch.compile(fn, backend="eager", fullgraph=True)(),
"""\
Dynamic shape operator
Explanation: Operator `aten.nonzero.default`'s output shape depends on input Tensor data.
Hint: Enable tracing of dynamic shape operators with `torch._dynamo.config.capture_dynamic_output_shape_ops = True`
Developer debug context: aten.nonzero.default
For more details about this graph break, please visit: https://meta-pytorch.github.io/compile-graph-break-site/gb/gb0036.html
from user code:
File "test_error_messages.py", line N, in fn
return torch.nonzero(torch.rand([10, 10]))""",
)
def test_dynamic_shape_operator_no_meta_kernel(self):
def fn():
return torch.linalg.lstsq(torch.rand(10, 10), torch.rand(10, 10))
@ -70,6 +91,29 @@ from user code:
return torch.linalg.lstsq(torch.rand(10, 10), torch.rand(10, 10))""",
)
def test_data_dependent_operator(self):
def fn(x):
return x.item()
self.assertExpectedInlineMunged(
Unsupported,
lambda: torch.compile(fn, backend="eager", fullgraph=True)(
torch.Tensor([1])
),
"""\
Unsupported Tensor.item() call with capture_scalar_outputs=False
Explanation: Dynamo does not support tracing `Tensor.item()` with config.capture_scalar_outputs=False.
Hint: Set `torch._dynamo.config.capture_scalar_outputs = True` or `export TORCHDYNAMO_CAPTURE_SCALAR_OUTPUTS=1` to include these operations in the captured graph.
Developer debug context: call_method TensorVariable() item () {}
For more details about this graph break, please visit: https://meta-pytorch.github.io/compile-graph-break-site/gb/gb0124.html
from user code:
File "test_error_messages.py", line N, in fn
return x.item()""",
)
def test_data_dependent_operator2(self):
def fn(x):
return torch.equal(x, x)

View File

@ -40,16 +40,11 @@ from torch.testing._internal.common_utils import (
instantiate_parametrized_tests,
parametrize,
)
from torch.testing._internal.inductor_utils import HAS_GPU
# Defines all the kernels for tests
from torch.testing._internal.triton_utils import * # noqa: F403
device_type = (
acc.type if (acc := torch.accelerator.current_accelerator(True)) else "cpu"
)
T = TypeVar("T")
d = torch.ones(10, 10)
@ -1155,10 +1150,10 @@ class FunctionTests(torch._dynamo.test_case.TestCase):
m = a.to(torch.float16)
return b.type(m.type())
@unittest.skipIf(not HAS_GPU, "requires gpu")
@unittest.skipIf(not torch.cuda.is_available(), "requires cuda")
@make_test
def test_tensor_type2(a, b):
m = a.to(device_type)
m = a.to("cuda")
return m + b.type(m.type())
@make_test
@ -2087,12 +2082,6 @@ class FunctionTests(torch._dynamo.test_case.TestCase):
mytuple = FunctionTests.MyNamedTuple(a, b)
return mytuple.add(), mytuple.static_method(), mytuple.class_method()
@make_test
def test_namedtuple_replace(a, b):
mytuple = FunctionTests.MyNamedTuple(a, b)
replaced = mytuple._replace(first=b)
return mytuple.first + mytuple.second + replaced.first + replaced.second
@make_test
def test_generic_namedtuple_user_methods(a, b):
mytuple = FunctionTests.MyGenericNamedTuple(a, b)
@ -4051,7 +4040,7 @@ class GraphModule(torch.nn.Module):
def f1():
mod1 = torch.get_device_module()
mod2 = torch.get_device_module("cpu")
mod3 = torch.get_device_module(torch.device(device_type))
mod3 = torch.get_device_module(torch.device("cuda"))
return mod1, mod2, mod3
self.assertEqual(f1(), torch.compile(f1, backend="eager", fullgraph=True)())
@ -4086,7 +4075,6 @@ class GraphModule(torch.nn.Module):
new_device = (
"cpu" if torch._C._get_accelerator() == torch.device("cuda") else "cuda"
)
old_get_device_module = torch.get_device_module
def new_get_device_module(device=None):
@ -4733,12 +4721,10 @@ class DefaultsTests(torch._dynamo.test_case.TestCase):
opt_fn(x, ys, zs[:1])
@unittest.skipIf(not TEST_MULTIGPU, "detected only one GPU")
def test_gpu_current_device(self):
def test_cuda_current_device(self):
def fn(x):
y = torch.empty(
(2, 3),
dtype=torch.float32,
device=torch.accelerator.current_device_index(),
(2, 3), dtype=torch.float32, device=torch.cuda.current_device()
)
y.copy_(x)
return torch.sin(y + y.device.index)
@ -4746,11 +4732,11 @@ class DefaultsTests(torch._dynamo.test_case.TestCase):
counter = torch._dynamo.testing.CompileCounter()
opt_fn = torch.compile(backend=counter, fullgraph=True)(fn)
with torch.accelerator.device_index(0):
with torch.cuda.device(0):
x = torch.randn(2, 3)
self.assertEqual(opt_fn(x), fn(x))
self.assertEqual(counter.frame_count, 1)
with torch.accelerator.device_index(1):
with torch.cuda.device(1):
self.assertEqual(opt_fn(x), fn(x))
self.assertEqual(counter.frame_count, 2)

View File

@ -1,5 +1,6 @@
# Owner(s): ["module: dynamo"]
import contextlib
import os
import torch
import torch.fx
@ -195,6 +196,21 @@ class GraphRegionTrackerTests(TestCase):
)
def test_mismatched_global_state(self):
@contextlib.contextmanager
def _hip_allow_tf32():
# for HIP/AMDGPU, tf32 is behind a flag because the TF32 support is new
# and only for MI300+
hip_allow_tf32 = os.environ.get("HIPBLASLT_ALLOW_TF32", None)
os.environ["HIPBLASLT_ALLOW_TF32"] = "1"
try:
yield
finally:
if hip_allow_tf32 is not None:
os.environ["HIPBLASLT_ALLOW_TF32"] = hip_allow_tf32
else:
del os.environ["HIPBLASLT_ALLOW_TF32"]
def inner_fn(x, y):
x1 = x * 1
y1 = y + 1
@ -235,29 +251,31 @@ class GraphRegionTrackerTests(TestCase):
def reset_default_dtype():
torch.set_default_dtype(old_dtype)
for ctx in [
lambda: torch.set_grad_enabled(False),
torch.autograd.grad_mode.inference_mode,
lambda: torch.autograd.graph.disable_saved_tensors_hooks(
"This is not supported"
),
# lambda: torch.set_num_threads(2), : Unsupported
(set_default_dtype_bfloat16, reset_default_dtype),
(
lambda: torch.use_deterministic_algorithms(True),
lambda: torch.use_deterministic_algorithms(False),
),
# (lambda: torch.use_deterministic_algorithms(True, warn_only=True),
# lambda: torch.use_deterministic_algorithms(False)), : Unsupported
create_toggle_fns("allow_bf16_reduced_precision_reduction"),
create_toggle_fns("allow_fp16_reduced_precision_reduction"),
create_toggle_fns("allow_tf32"),
]:
self.assertExpectedInline(
self.get_result(fn, torch.rand(10, 10), torch.ones(10, 20), ctx),
"""[[['x1_2', 'y1_2', 'sum_3', 'o0'], ['x1_3', 'y1_3', 'sum_4', 'o2']], \
tf32_ctx = _hip_allow_tf32 if torch.version.hip else contextlib.nullcontext
with tf32_ctx():
for ctx in [
lambda: torch.set_grad_enabled(False),
torch.autograd.grad_mode.inference_mode,
lambda: torch.autograd.graph.disable_saved_tensors_hooks(
"This is not supported"
),
# lambda: torch.set_num_threads(2), : Unsupported
(set_default_dtype_bfloat16, reset_default_dtype),
(
lambda: torch.use_deterministic_algorithms(True),
lambda: torch.use_deterministic_algorithms(False),
),
# (lambda: torch.use_deterministic_algorithms(True, warn_only=True),
# lambda: torch.use_deterministic_algorithms(False)), : Unsupported
create_toggle_fns("allow_bf16_reduced_precision_reduction"),
create_toggle_fns("allow_fp16_reduced_precision_reduction"),
create_toggle_fns("allow_tf32"),
]:
self.assertExpectedInline(
self.get_result(fn, torch.rand(10, 10), torch.ones(10, 20), ctx),
"""[[['x1_2', 'y1_2', 'sum_3', 'o0'], ['x1_3', 'y1_3', 'sum_4', 'o2']], \
[['x1', 'y1', 'sum_1', 'o4'], ['x1_1', 'y1_1', 'sum_2', 'o5']]]""",
)
)
def test_mutation_tracking_simple(self):
def fn(x, y, z):

View File

@ -116,6 +116,8 @@ num_guards_executed=0)
const_guard = guards.LAMBDA_GUARD(
root,
functools.partial(equals_match, expected=5),
{},
False,
equals_match_verbose_code_parts(5),
)
self.assertTrue(const_guard(5))
@ -405,10 +407,14 @@ num_guards_executed=0)
guard_manager.add_type_match_guard(id_type(5), ["type(x) == int"])
guard_manager.add_lambda_guard(
functools.partial(ge_match, expected=5),
{},
False,
ge_match_verbose_code_parts(expected=5),
)
guard_manager.add_lambda_guard(
functools.partial(less_match, expected=10),
{},
False,
less_match_verbose_code_parts(expected=10),
)
self.assertEqual(len(guard_manager.get_leaf_guards()), 3)
@ -428,10 +434,14 @@ num_guards_executed=0)
guard_manager.add_type_match_guard(id_type(foo), ["type(x) == Foo"])
guard_manager.getattr_manager("x", "x", 1, default_mgr_enum).add_lambda_guard(
functools.partial(equals_match, expected=foo.x),
{},
False,
equals_match_verbose_code_parts(foo.x),
)
guard_manager.getattr_manager("y", "y", 2, default_mgr_enum).add_lambda_guard(
functools.partial(equals_match, expected=foo.y),
{},
False,
equals_match_verbose_code_parts(foo.y),
)
self.assertEqual(len(guard_manager.get_leaf_guards()), 1)
@ -474,10 +484,14 @@ num_guards_executed=0)
guard_manager.add_type_match_guard(id_type(foo), ["type(x) == Foo"])
guard_manager.getitem_manager(0, "", 1, default_mgr_enum).add_lambda_guard(
functools.partial(equals_match, expected=foo[0]),
{},
False,
equals_match_verbose_code_parts(foo[0]),
)
guard_manager.getitem_manager(1, "", 2, default_mgr_enum).add_lambda_guard(
functools.partial(equals_match, expected=foo[1]),
{},
False,
equals_match_verbose_code_parts(foo[1]),
)
self.assertEqual(len(guard_manager.get_leaf_guards()), 1)
@ -585,6 +599,8 @@ num_guards_executed=0)
lambda x: isinstance(x, Pair)
and isinstance(x.x, torch.Tensor)
and isinstance(x.y, int),
{},
False,
"global guard fail",
)
@ -635,6 +651,8 @@ num_guards_executed=0)
)
attr_manager.add_lambda_guard(
lambda x: x == 4,
{},
False,
"Expected value 4",
)
@ -675,6 +693,8 @@ num_guards_executed=0)
weakref_manager.add_lambda_guard(
lambda x: isinstance(x, torch.Tensor),
{},
False,
"global weakref fail",
)
@ -694,6 +714,8 @@ num_guards_executed=0)
)
foo_mgr.add_lambda_guard(
lambda x: x == 3,
{},
False,
"Expected value 3",
)
self.assertTrue(guard_manager.check(a))
@ -779,7 +801,7 @@ num_guards_executed=0)
# Add key-value manager (nothing : {"z" : 3})
self.assertTrue(root.check(f_locals))
dict_mgr.get_key_manager(1, "", nothing, default_mgr_enum).add_lambda_guard(
lambda x: x is nothing, ["x is nothing"]
lambda x: x is nothing, {}, False, ["x is nothing"]
)
self.assertTrue(root.check(f_locals))
value_mgr = dict_mgr.get_value_manager(

View File

@ -7207,7 +7207,9 @@ utils_device.CURRENT_DEVICE == None""".split("\n"):
return x + 1
guard_manager = torch._dynamo.guards.RootGuardManager()
guard_manager.add_lambda_guard(lambda L: isinstance(L["x"], int), [])
guard_manager.add_lambda_guard(
lambda L: isinstance(L["x"], int), {"x": 0}, True, []
)
def injected(x):
return x + 42
@ -7232,27 +7234,33 @@ utils_device.CURRENT_DEVICE == None""".split("\n"):
return x + 1
guard_manager_bool = torch._dynamo.guards.RootGuardManager()
guard_manager_bool.add_lambda_guard(lambda L: isinstance(L["x"], bool), [])
guard_manager_bool.add_lambda_guard(
lambda L: isinstance(L["x"], bool), {"x": 0}, True, []
)
def injected_bool(x: bool):
return x + 102
guard_manager_int = torch._dynamo.guards.RootGuardManager()
guard_manager_int.add_lambda_guard(lambda L: isinstance(L["x"], int), [])
guard_manager_int.add_lambda_guard(
lambda L: isinstance(L["x"], int), {"x": 0}, True, []
)
def injected_int(x: int):
return x + 42
guard_manager_tensor = torch._dynamo.guards.RootGuardManager()
guard_manager_tensor.add_lambda_guard(
lambda L: isinstance(L["x"], torch.Tensor), []
lambda L: isinstance(L["x"], torch.Tensor), {"x": 0}, True, []
)
def injected_tensor(x: torch.Tensor):
return x + 100
guard_manager_str = torch._dynamo.guards.RootGuardManager()
guard_manager_str.add_lambda_guard(lambda L: isinstance(L["x"], str), [])
guard_manager_str.add_lambda_guard(
lambda L: isinstance(L["x"], str), {"x": 0}, True, []
)
def injected_str(x: str):
return x + "1"
@ -7329,7 +7337,10 @@ utils_device.CURRENT_DEVICE == None""".split("\n"):
guard_manager_bool = torch._dynamo.guards.RootGuardManager()
guard_manager_bool.add_lambda_guard(
lambda L: isinstance(L["x"], bool), ["isinstance(L['x'], bool)"]
lambda L: isinstance(L["x"], bool),
{"x": 0},
True,
["isinstance(L['x'], bool)"],
)
def injected_bool(x: bool):
@ -8467,24 +8478,43 @@ utils_device.CURRENT_DEVICE == None""".split("\n"):
def fn(x):
return x + 1
initial_state = read_state()
y = torch.randn(10)
try:
for round in range(3):
for i in range(len(initial_state)):
new_state = [False] * len(initial_state)
new_state[i] = True
write_state(new_state)
assert read_state() == new_state
last_state.clear()
fn(y)
assert last_state == new_state
if round == 0:
assert cnt == i + 1
else:
assert cnt == len(initial_state)
finally:
write_state(initial_state)
import contextlib
@contextlib.contextmanager
def _hip_allow_tf32():
# for HIP/AMDGPU, tf32 is behind a flag because the TF32 support is new
# and only for MI300+
hip_allow_tf32 = os.environ.get("HIPBLASLT_ALLOW_TF32", None)
os.environ["HIPBLASLT_ALLOW_TF32"] = "1"
try:
yield
finally:
if hip_allow_tf32 is not None:
os.environ["HIPBLASLT_ALLOW_TF32"] = hip_allow_tf32
else:
del os.environ["HIPBLASLT_ALLOW_TF32"]
tf32_ctx = _hip_allow_tf32 if torch.version.hip else contextlib.nullcontext
with tf32_ctx():
initial_state = read_state()
y = torch.randn(10)
try:
for round in range(3):
for i in range(len(initial_state)):
new_state = [False] * len(initial_state)
new_state[i] = True
write_state(new_state)
assert read_state() == new_state
last_state.clear()
fn(y)
assert last_state == new_state
if round == 0:
assert cnt == i + 1
else:
assert cnt == len(initial_state)
finally:
write_state(initial_state)
def test_grad_state_mutated(self):
prior = torch.is_grad_enabled()
@ -13263,7 +13293,7 @@ class MiscTestsDevice(torch._inductor.test_case.TestCase):
self.assertEqual(out, opt_out)
@unittest.skipIf(not TEST_MULTIGPU, "need multiple GPU")
def test_gpu_set_device(self, device):
def test_cuda_set_device(self, device):
def fn():
a = torch.ones(2, device=device)
torch.get_device_module(device).set_device(1)
@ -13343,26 +13373,6 @@ class MiscTestsDevice(torch._inductor.test_case.TestCase):
y = torch.tensor(5)
f(x, y)
def test_full_graph_capture_scalar_outputs(self):
@torch.compile(fullgraph=True)
def foo(a):
return torch.randn(5) * a.item()
# We expect to no longer raise here
foo(torch.tensor(2.0))
def test_full_graph_capture_dynamic_output_shape_ops(self):
def fn(x):
nz = torch.nonzero(x)
squared = nz * nz
sliced = torch.ops.aten.slice.Tensor(squared, dim=1, start=-2, end=None)
view = sliced.unsqueeze(dim=0)
return view.squeeze(dim=0)
example_inputs = (torch.randn(1, 1, 1, 1),)
# we expect to no longer raise here
torch.compile(fn, fullgraph=True)(*example_inputs)
def test_dynamic_float_scalar_tensor_coersion(self):
# Minified version of https://github.com/pytorch/pytorch/issues/158376#issuecomment-3079591367
class Foo:

View File

@ -47,8 +47,7 @@ class PrecompileContextTests(InductorTestCase):
x = torch.randn(10, device=GPU_TYPE, requires_grad=True)
result = compiled_fn(x)
result.sum().backward()
self.assertEqual(len(PrecompileContext._dynamo_cache_entries), 1)
self.assertEqual(len(PrecompileContext._backend_artifacts_by_key), 1)
self.assertEqual(len(PrecompileContext._new_cache_artifacts_by_key), 2)
self.assertEqual(len(PrecompileContext._new_cache_artifacts), 0)
result = PrecompileContext.serialize()
@ -83,9 +82,8 @@ class PrecompileContextTests(InductorTestCase):
x = torch.randn(10, device=GPU_TYPE, requires_grad=True)
result = compiled_fn(x)
result.sum().backward()
self.assertEqual(len(PrecompileContext._dynamo_cache_entries), 1)
self.assertEqual(len(PrecompileContext._backend_artifacts_by_key), 1)
for key in PrecompileContext._backend_artifacts_by_key.keys():
self.assertEqual(len(PrecompileContext._new_cache_artifacts_by_key), 2)
for key in PrecompileContext._new_cache_artifacts_by_key.keys():
result = PrecompileContext.serialize_artifact_by_key(key)
assert isinstance(result, PrecompileCacheArtifact)
self.assertEqual(result.key, key)
@ -111,12 +109,11 @@ class PrecompileContextTests(InductorTestCase):
x = torch.randn(10, device=GPU_TYPE, requires_grad=True)
result = compiled_fn(x)
result.sum().backward()
self.assertEqual(len(PrecompileContext._dynamo_cache_entries), 1)
self.assertEqual(len(PrecompileContext._backend_artifacts_by_key), 1)
self.assertEqual(len(PrecompileContext._new_cache_artifacts_by_key), 2)
# Find the key for the artifact of type "precompile_aot_autograd"
key = next(
k
for k, v in PrecompileContext._backend_artifacts_by_key.items()
for k, v in PrecompileContext._new_cache_artifacts_by_key.items()
if isinstance(v, EditablePrecompileCacheArtifact)
)
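Note on the PrecompileContext hunks: the assertions move from the old _dynamo_cache_entries / _backend_artifacts_by_key attributes to _new_cache_artifacts_by_key and _new_cache_artifacts. These are private registries and may change again; the sketch below only illustrates the lookup pattern the updated tests use, with the import path assumed to match the test module.

    # Hedged sketch: walk the renamed private registry and serialize each entry,
    # as the updated assertions do. Private API; shape may differ across versions.
    from torch._dynamo.precompile_context import PrecompileCacheArtifact, PrecompileContext

    for key, artifact in PrecompileContext._new_cache_artifacts_by_key.items():
        serialized = PrecompileContext.serialize_artifact_by_key(key)
        assert isinstance(serialized, PrecompileCacheArtifact)
        assert serialized.key == key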

View File

@ -109,8 +109,6 @@ class StructuredTraceTestingFormatter(logging.Formatter):
metadata["dynamo_start"]["stack"] = "STACK"
if "inductor_output_code" in metadata:
metadata["inductor_output_code"]["filename"] = "FILENAME"
if "file_path" in metadata["inductor_output_code"]:
metadata["inductor_output_code"]["file_path"] = "FILENAME"
if "stack" in metadata:
metadata["stack"] = "STACK"
if "compilation_metrics" in metadata:
@ -261,7 +259,7 @@ class StructuredTraceTest(TestCase):
{"artifact": {"name": "fx_graph_runnable", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "before_post_grad_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "after_post_grad_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"inductor_output_code": {"filename": "FILENAME", "file_path": "FILENAME"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"inductor_output_code": {"filename": "FILENAME"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "triton_kernel_info", "encoding": "json"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "fx_graph_cache_miss", "encoding": "json"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"dynamo_cpp_guards_str": {}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
@ -295,7 +293,7 @@ class StructuredTraceTest(TestCase):
{"artifact": {"name": "fx_graph_runnable", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "before_post_grad_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "after_post_grad_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"inductor_output_code": {"filename": "FILENAME", "file_path": "FILENAME"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"inductor_output_code": {"filename": "FILENAME"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "triton_kernel_info", "encoding": "json"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "fx_graph_cache_miss", "encoding": "json"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"dynamo_cpp_guards_str": {}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
@ -337,7 +335,7 @@ class StructuredTraceTest(TestCase):
{"artifact": {"name": "fx_graph_runnable", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "before_post_grad_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "after_post_grad_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"inductor_output_code": {"filename": "FILENAME", "file_path": "FILENAME"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"inductor_output_code": {"filename": "FILENAME"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "fx_graph_cache_miss", "encoding": "json"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"dynamo_cpp_guards_str": {}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"compilation_metrics": "METRICS", "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
@ -359,7 +357,7 @@ class StructuredTraceTest(TestCase):
{"artifact": {"name": "fx_graph_runnable", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "before_post_grad_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "after_post_grad_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0, "has_payload": "HASH"}
{"inductor_output_code": {"filename": "FILENAME", "file_path": "FILENAME"}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0, "has_payload": "HASH"}
{"inductor_output_code": {"filename": "FILENAME"}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "fx_graph_cache_miss", "encoding": "json"}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0, "has_payload": "HASH"}
{"dynamo_cpp_guards_str": {}, "frame_id": 0, "frame_compile_id": 1, "attempt": 0, "has_payload": "HASH"}
{"compilation_metrics": "METRICS", "frame_id": 0, "frame_compile_id": 1, "attempt": 0}
@ -391,7 +389,7 @@ class StructuredTraceTest(TestCase):
{"artifact": {"name": "fx_graph_runnable", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "before_post_grad_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "after_post_grad_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"inductor_output_code": {"filename": "FILENAME", "file_path": "FILENAME"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"inductor_output_code": {"filename": "FILENAME"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "fx_graph_cache_miss", "encoding": "json"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"dynamo_cpp_guards_str": {}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"compilation_metrics": "METRICS", "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
@ -448,7 +446,7 @@ class StructuredTraceTest(TestCase):
{"artifact": {"name": "fx_graph_runnable", "encoding": "string"}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1, "has_payload": "HASH"}
{"artifact": {"name": "before_post_grad_graph", "encoding": "string"}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1, "has_payload": "HASH"}
{"artifact": {"name": "after_post_grad_graph", "encoding": "string"}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1, "has_payload": "HASH"}
{"inductor_output_code": {"filename": "FILENAME", "file_path": "FILENAME"}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1, "has_payload": "HASH"}
{"inductor_output_code": {"filename": "FILENAME"}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1, "has_payload": "HASH"}
{"artifact": {"name": "fx_graph_cache_miss", "encoding": "json"}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1, "has_payload": "HASH"}
{"dynamo_cpp_guards_str": {}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1, "has_payload": "HASH"}
{"compilation_metrics": "METRICS", "frame_id": 2, "frame_compile_id": 0, "attempt": 1}
@ -457,7 +455,7 @@ class StructuredTraceTest(TestCase):
{"artifact": {"name": "fx_graph_runnable", "encoding": "string"}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1, "has_payload": "HASH"}
{"artifact": {"name": "before_post_grad_graph", "encoding": "string"}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1, "has_payload": "HASH"}
{"artifact": {"name": "after_post_grad_graph", "encoding": "string"}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1, "has_payload": "HASH"}
{"inductor_output_code": {"filename": "FILENAME", "file_path": "FILENAME"}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1, "has_payload": "HASH"}
{"inductor_output_code": {"filename": "FILENAME"}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1, "has_payload": "HASH"}
{"artifact": {"name": "fx_graph_cache_miss", "encoding": "json"}, "frame_id": 2, "frame_compile_id": 0, "attempt": 1, "has_payload": "HASH"}
{"bwd_compilation_metrics": "METRICS", "frame_id": 2, "frame_compile_id": 0, "attempt": 1}
{"dynamo_start": {"stack": "STACK"}, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
@ -600,7 +598,7 @@ class StructuredTraceTest(TestCase):
{"artifact": {"name": "fx_graph_runnable", "encoding": "string"}, "rank": 0, "frame_id": 3, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "before_post_grad_graph", "encoding": "string"}, "rank": 0, "frame_id": 3, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "after_post_grad_graph", "encoding": "string"}, "rank": 0, "frame_id": 3, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"inductor_output_code": {"filename": "FILENAME", "file_path": "FILENAME"}, "rank": 0, "frame_id": 3, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"inductor_output_code": {"filename": "FILENAME"}, "rank": 0, "frame_id": 3, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "fx_graph_cache_miss", "encoding": "json"}, "rank": 0, "frame_id": 3, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "aotautograd_cache_bypass", "encoding": "json"}, "rank": 0, "frame_id": 3, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "before_pre_grad_graph", "encoding": "string"}, "rank": 0, "frame_id": 3, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
@ -613,7 +611,7 @@ class StructuredTraceTest(TestCase):
{"artifact": {"name": "fx_graph_runnable", "encoding": "string"}, "rank": 0, "frame_id": 3, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "before_post_grad_graph", "encoding": "string"}, "rank": 0, "frame_id": 3, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "after_post_grad_graph", "encoding": "string"}, "rank": 0, "frame_id": 3, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"inductor_output_code": {"filename": "FILENAME", "file_path": "FILENAME"}, "rank": 0, "frame_id": 3, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"inductor_output_code": {"filename": "FILENAME"}, "rank": 0, "frame_id": 3, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "fx_graph_cache_miss", "encoding": "json"}, "rank": 0, "frame_id": 3, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "aotautograd_cache_bypass", "encoding": "json"}, "rank": 0, "frame_id": 3, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"dynamo_cpp_guards_str": {}, "rank": 0, "frame_id": 3, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
@ -681,7 +679,7 @@ class StructuredTraceTest(TestCase):
{"artifact": {"name": "fx_graph_runnable", "encoding": "string"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "before_post_grad_graph", "encoding": "string"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "after_post_grad_graph", "encoding": "string"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"inductor_output_code": {"filename": "FILENAME", "file_path": "FILENAME"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"inductor_output_code": {"filename": "FILENAME"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "fx_graph_cache_miss", "encoding": "json"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"describe_storage": {"id": 16, "describer_id": "ID", "size": 4194304}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
{"describe_tensor": {"id": 29, "ndim": 2, "dtype": "torch.float32", "device": "device(type='cuda', index=0)", "size": [1024, 1024], "dynamo_hint_overrides": {}, "is_leaf": true, "requires_grad": true, "is_parameter": true, "stride": [1024, 1], "storage": 16, "view_func": "VIEW_FUNC", "describer_id": "ID"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
@ -700,7 +698,7 @@ class StructuredTraceTest(TestCase):
{"artifact": {"name": "fx_graph_runnable", "encoding": "string"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "before_post_grad_graph", "encoding": "string"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "after_post_grad_graph", "encoding": "string"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"inductor_output_code": {"filename": "FILENAME", "file_path": "FILENAME"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"inductor_output_code": {"filename": "FILENAME"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "fx_graph_cache_miss", "encoding": "json"}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"dynamo_cpp_guards_str": {}, "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"compilation_metrics": "METRICS", "rank": 0, "frame_id": 4, "frame_compile_id": 0, "attempt": 0}
@ -741,7 +739,7 @@ class StructuredTraceTest(TestCase):
{"artifact": {"name": "fx_graph_runnable", "encoding": "string"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "before_post_grad_graph", "encoding": "string"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "after_post_grad_graph", "encoding": "string"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"inductor_output_code": {"filename": "FILENAME", "file_path": "FILENAME"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"inductor_output_code": {"filename": "FILENAME"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "fx_graph_cache_miss", "encoding": "json"}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"dynamo_cpp_guards_str": {}, "frame_id": 1, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"compilation_metrics": "METRICS", "frame_id": 1, "frame_compile_id": 0, "attempt": 0}
@ -902,7 +900,7 @@ def forward(self, x, y):
{"artifact": {"name": "fx_graph_runnable", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "before_post_grad_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "after_post_grad_graph", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"inductor_output_code": {"filename": "FILENAME", "file_path": "FILENAME"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"inductor_output_code": {"filename": "FILENAME"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "fx_graph_cache_miss", "encoding": "json"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"dynamo_cpp_guards_str": {}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"compilation_metrics": "METRICS", "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
@ -917,7 +915,7 @@ def forward(self, x, y):
{"aot_inference_graph": {}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "fx_graph_runnable", "encoding": "string"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"inductor_post_grad_graph": {}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"inductor_output_code": {"filename": "FILENAME", "file_path": "FILENAME"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"inductor_output_code": {"filename": "FILENAME"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}
{"artifact": {"name": "inductor_provenance_tracking_node_mappings", "encoding": "json"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"artifact": {"name": "inductor_provenance_tracking_kernel_stack_traces", "encoding": "json"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0}
{"artifact": {"name": "fx_graph_cache_hit", "encoding": "json"}, "frame_id": 0, "frame_compile_id": 0, "attempt": 0, "has_payload": "HASH"}

Some files were not shown because too many files have changed in this diff.