Mirror of https://github.com/pytorch/pytorch.git (synced 2025-10-25 16:14:55 +08:00)

Compare commits: 8 commits, logsumexp...brister/fx

| SHA1 |
|---|
| 1175db1e14 |
| ee9ff543ea |
| a01600a0b4 |
| de01602ff6 |
| 3460bd6897 |
| c53a2ae78e |
| 194480f0ec |
| 0a11d1d0db |
@ -35,11 +35,10 @@ fi
print_cmake_info
if [[ ${BUILD_ENVIRONMENT} == *"distributed"* ]]; then
  # Needed for inductor benchmarks, as lots of HF networks make `torch.distributed` calls
  USE_DISTRIBUTED=1 USE_OPENMP=1 WERROR=1 python setup.py bdist_wheel
  USE_OPENMP=1 WERROR=1 python setup.py bdist_wheel
else
  # Explicitly set USE_DISTRIBUTED=0 to align with the default build config on mac. This also serves as the sole CI config that tests
  # that building with USE_DISTRIBUTED=0 works at all. See https://github.com/pytorch/pytorch/issues/86448
  # NB: we always build with distributed; USE_DISTRIBUTED turns off all
  # backends (specifically the gloo backend), so test that this case works too
  USE_DISTRIBUTED=0 USE_OPENMP=1 MACOSX_DEPLOYMENT_TARGET=11.0 WERROR=1 BUILD_TEST=OFF USE_PYTORCH_METAL=1 python setup.py bdist_wheel --plat-name macosx_11_0_arm64
fi

if which sccache > /dev/null; then
@ -13,9 +13,13 @@ if [[ ! $(python -c "import torch; print(int(torch.backends.openmp.is_available(
fi
popd

python -mpip install -r requirements.txt

# enable debug asserts in serialization
export TORCH_SERIALIZATION_DEBUG=1

python -mpip install --no-input -r requirements.txt

setup_test_python() {
  # The CircleCI worker hostname doesn't resolve to an address.
  # This environment variable makes ProcessGroupGloo default to
@ -177,7 +177,8 @@ source ~/${desired_python}-build/bin/activate
retry pip install "${PINNED_PACKAGES[@]}" -r "${pytorch_rootdir}/requirements.txt"
retry brew install libomp

# For USE_DISTRIBUTED=1 on macOS, need libuv, which is built as part of the tensorpipe submodule
# For USE_DISTRIBUTED=1 on macOS, this enables gloo, which needs libuv, which
# is built as part of the tensorpipe submodule
export USE_DISTRIBUTED=1

export USE_MKLDNN=OFF
.github/workflows/pull.yml (vendored, 2 changed lines)
@ -127,8 +127,6 @@ jobs:
    uses: ./.github/workflows/_linux-build.yml
    needs: get-label-type
    with:
      # More memory is needed to build with asan
      runner: linux.2xlarge.memory
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build-environment: linux-jammy-py3.10-clang18-asan
      docker-image-name: ci-image:pytorch-linux-jammy-py3-clang18-asan
.github/workflows/slow.yml (vendored, 2 changed lines)
@ -140,8 +140,6 @@ jobs:
    uses: ./.github/workflows/_linux-build.yml
    needs: get-label-type
    with:
      # More memory is needed to build with asan
      runner: linux.2xlarge.memory
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build-environment: linux-jammy-py3.10-clang18-asan
      docker-image-name: ci-image:pytorch-linux-jammy-py3-clang18-asan
.gitignore (vendored, 1 changed line)
@ -82,7 +82,6 @@ torch/return_types.pyi
torch/nn/functional.pyi
torch/utils/data/datapipes/datapipe.pyi
torch/csrc/autograd/generated/*
torch/csrc/functionalization/generated/*
torch/csrc/lazy/generated/*.[!m]*
torch_compile_debug/
# Listed manually because some files in this directory are not generated
@ -22,7 +22,6 @@ COMMON_COPTS = [
    "-DHAVE_SHM_UNLINK=1",
    "-D_FILE_OFFSET_BITS=64",
    "-DUSE_FBGEMM",
    "-DUSE_DISTRIBUTED",
    "-DAT_PER_OPERATOR_HEADERS",
    "-DATEN_THREADING=NATIVE",
    "-DNO_CUDNN_DESTROY_HANDLE",
@ -91,8 +90,6 @@ generated_cpu_cpp = [
    "aten/src/ATen/NativeMetaFunctions.h",
    "aten/src/ATen/RegistrationDeclarations.h",
    "aten/src/ATen/VmapGeneratedPlumbing.h",
    "aten/src/ATen/ViewMetaClasses.h",
    "aten/src/ATen/ViewMetaClasses.cpp",
    "aten/src/ATen/core/aten_interned_strings.h",
    "aten/src/ATen/core/enum_tag.h",
    "aten/src/ATen/core/TensorBody.h",
@ -813,7 +810,7 @@ cc_library(
    name = "torch_python",
    srcs = libtorch_python_core_sources
        + if_cuda(libtorch_python_cuda_sources)
        + if_cuda(libtorch_python_distributed_sources)
        + libtorch_python_distributed_sources
        + GENERATED_AUTOGRAD_PYTHON,
    hdrs = glob([
        "torch/csrc/generic/*.cpp",
@ -1077,7 +1074,6 @@ test_suite(
    "aten/src/ATen/templates/LazyNonNativeIr.h",
    "aten/src/ATen/templates/RegisterDispatchKey.cpp",
    "aten/src/ATen/templates/RegisterDispatchDefinitions.ini",
    "aten/src/ATen/templates/ViewMetaClassesPythonBinding.cpp",
    "aten/src/ATen/native/native_functions.yaml",
    "aten/src/ATen/native/tags.yaml",
    "aten/src/ATen/native/ts_native_functions.yaml",
@ -180,8 +180,9 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(ppc64le)")
  set(CPU_POWER ON)
endif()

# For non-supported platforms, turn USE_DISTRIBUTED off by default. It is not
# tested and likely won't work without additional changes.
# For non-supported platforms, turn USE_DISTRIBUTED off by default.
# NB: USE_DISTRIBUTED simply disables the backend; distributed code
# still gets built
if(NOT LINUX AND NOT WIN32)
  set(USE_DISTRIBUTED
      OFF
@ -261,11 +262,11 @@ option(USE_PYTORCH_METAL "Use Metal for PyTorch iOS build" OFF)
option(USE_PYTORCH_METAL_EXPORT "Export Metal models on MacOSX desktop" OFF)
option(USE_NATIVE_ARCH "Use -march=native" OFF)
cmake_dependent_option(USE_MPS "Use MPS for macOS build" ON "MPS_FOUND" OFF)
option(USE_DISTRIBUTED "Use distributed" ON)
option(USE_DISTRIBUTED "Enable default distributed backends" ON)
cmake_dependent_option(USE_NCCL "Use NCCL" ON
    "USE_DISTRIBUTED;USE_CUDA OR USE_ROCM;UNIX;NOT APPLE" OFF)
cmake_dependent_option(USE_XCCL "Use XCCL" ON
    "USE_XPU;UNIX;NOT APPLE" OFF)
    "USE_DISTRIBUTED;USE_XPU;UNIX;NOT APPLE" OFF)
cmake_dependent_option(USE_RCCL "Use RCCL" ON USE_NCCL OFF)
cmake_dependent_option(USE_RCCL "Use RCCL" ON "USE_NCCL;NOT WIN32" OFF)
cmake_dependent_option(USE_STATIC_NCCL "Use static NCCL" OFF "USE_NCCL" OFF)
@ -437,11 +438,10 @@ if(WIN32)
    PATH_SUFFIXES lib
    NO_DEFAULT_PATH)
  if(NOT libuv_tmp_LIBRARY)
    set(USE_DISTRIBUTED OFF)
    set(USE_GLOO OFF)
    message(
      WARNING
      "Libuv is not installed in current conda env. Set USE_DISTRIBUTED to OFF. "
      "Libuv is not installed in current conda env. Set USE_GLOO to OFF. "
      "Please run command 'conda install -c conda-forge libuv=1.39' to install libuv."
    )
  else()
@ -9,6 +9,11 @@

namespace at::functionalization {

ViewMeta ViewMeta::to_out_idx(int64_t out_idx) {
  if (out_idx == this->out_index) return *this;
  return ViewMeta(forward_fn, reverse_fn, has_symbolic_inputs, is_multi_output, is_as_strided, out_idx);
}

// Note [Functionalization: Alias Removal Part 2]
// See Note [Functionalization: Alias Removal] for more details.
// This function applies a single update from one of the views to the StorageImpl.
@ -37,12 +42,12 @@ namespace at::functionalization {
static const Tensor apply_update(const FunctionalStorageImpl::Update& update, const Tensor& base) {
  at::Tensor t = update.new_val;
  TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(t));
  if (update.view_metas.empty()) { return t; }
  if (update.view_metas.empty()) return t;

  std::vector<at::Tensor> tmp_values({base});
  tmp_values.reserve(update.view_metas.size());
  for (size_t i = 0; i < update.view_metas.size() - 1; ++i) {
    at::Tensor next_view = update.view_metas[i]->forward(tmp_values.back());
    at::Tensor next_view = update.view_metas[i].forward_fn(tmp_values.back(), update.view_metas[i].out_index);
    // NB: We only actually need tmp_values for ops like select/slice/diagonal/squeeze/as_strided
    // All of these ops require additional information to recover the sizes of the original tensor.
    // If need to, we could probably apply this optimization and only bother computing tmp_values
@ -50,8 +55,9 @@ static const Tensor apply_update(const FunctionalStorageImpl::Update& update, co
    tmp_values.push_back(std::move(next_view));
  }
  for(int64_t i = static_cast<int64_t>(update.view_metas.size()) - 1; i >= 0; --i) {
    int64_t out_idx = update.view_metas[i].out_index;
    // Each view inverse is implemented in ViewInverses.cpp.
    t = update.view_metas[i]->reverse(tmp_values[i], t);
    t = update.view_metas[i].reverse_fn(tmp_values[i], t, out_idx);
  }
  TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(t));
  return t;
@ -105,13 +111,13 @@ FunctionalStorageImpl::FunctionalStorageImpl(const Tensor& base)
  TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(base_));
}

void FunctionalStorageImpl::add_update(const Tensor& updated_val, const std::vector<std::shared_ptr<ViewMeta>>& metas) {
void FunctionalStorageImpl::add_update(const Tensor& updated_val, const std::vector<ViewMeta>& metas) {
  TORCH_CHECK(!frozen_, "cannot mutate tensors with frozen storage");

  if (metas.size() > 1) {
    for (size_t i = 1; i < metas.size(); ++i) {
      // Skipping this check for XLA. Would be good to add it back, but it is failing XLA CI
      TORCH_CHECK(updated_val.device().type() == c10::DeviceType::XLA || !metas[i]->is_as_strided,
      TORCH_CHECK(updated_val.device().type() == c10::DeviceType::XLA || !metas[i].is_as_strided,
        "During torch.compile, encountered a mutation on a view chain of length ", metas.size(), ", where view ", i,
        " was an as_strided() call. as_strided() is non-compositional, and therefore is not possible to functionalize properly today,"
        "so this behavior is banned in compile. As a workaround, you can either remove the mutation from the model code, or you "
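The apply_update() hunk above replays the stored view chain forward (through each forward_fn) to rebuild the intermediate views, then walks the chain backwards through the reverse_fn inverses to scatter the mutated value back into the base. A minimal standalone sketch of that replay-then-invert pattern, using integers and plain lambdas instead of tensors (all names here are illustrative, not PyTorch API):

```cpp
#include <cstdint>
#include <functional>
#include <iostream>
#include <vector>

// Hypothetical stand-in for ViewMeta: forward_fn replays a "view" on a base,
// reverse_fn maps a mutated view back onto its base.
struct MiniViewMeta {
  std::function<int(int)> forward_fn;
  std::function<int(int, int)> reverse_fn;  // (base, mutated_view) -> new base
};

// Mirrors the shape of apply_update(): replay forward over the chain to get
// each intermediate view, then invert from the innermost view outwards.
int apply_update(int base, int new_val, const std::vector<MiniViewMeta>& metas) {
  std::vector<int> tmp_values{base};
  for (size_t i = 0; i + 1 < metas.size(); ++i) {
    tmp_values.push_back(metas[i].forward_fn(tmp_values.back()));
  }
  int t = new_val;
  for (int64_t i = static_cast<int64_t>(metas.size()) - 1; i >= 0; --i) {
    t = metas[i].reverse_fn(tmp_values[i], t);
  }
  return t;
}

int main() {
  std::vector<MiniViewMeta> metas = {
      // "view 1": add 10; its inverse subtracts 10 from the mutated view.
      {[](int b) { return b + 10; }, [](int, int mv) { return mv - 10; }},
      // "view 2": double; its inverse halves the mutated view.
      {[](int b) { return b * 2; }, [](int, int mv) { return mv / 2; }},
  };
  // base = 5, so the view chain yields (5 + 10) * 2 = 30; the caller mutated
  // that view to 42, so the new base becomes 42 / 2 - 10 = 11.
  std::cout << apply_update(5, 42, metas) << "\n";  // prints 11
}
```

As in the real code, the forward pass only needs to run up to the second-to-last view, since the last view's value is exactly the mutated tensor being written back.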
@ -8,89 +8,44 @@ namespace at::functionalization {

// See Note [Functionalization Pass In Core]

enum class InverseReturnMode {
  /// Specifies that functional inverses should always return a view.
  AlwaysView,
  /// Specifies that functional inverses should always return a non-view / copy.
  NeverView,
  /// Specifies that functional inverses should return a view unless a (copying)
  /// scatter
  /// inverse exists, in which case that will be used instead.
  /// This avoids as_strided() calls that can be difficult for subclasses to
  /// handle.
  ViewOrScatterInverse,
};

#define FUNCTIONALIZATION_VIEWMETA_NAME(TYPE) \
  static const char* name() {                 \
    return #TYPE;                             \
  }

#define FUNCTIONALIZATION_VIEWMETA_SERIALIZABLE_TUPLE(...) \
  using SerializableTuple = std::tuple<__VA_ARGS__>

// ViewMeta is a class used by the functionalization pass to navigate between
// a base tensor and a view tensor.
// For example, if I call `b = a.view1(...)`
// the functionalization pass will generate and store a ViewMeta specialization
// for `view1` operation on b that looks like:
// the functionalization pass will generate and store a ViewMeta on b that looks
// like:
//
// struct TORCH_API view1_ViewMeta : public ViewMeta {
//   FUNCTIONALIZATION_VIEWMETA_NAME(view1_ViewMeta);
//   FUNCTIONALIZATION_VIEWMETA_SERIALIZABLE_TUPLE(
//       bool /* reapply_views */,
//       const std::vector<int64_t>&);
//
//   view1_ViewMeta(const SerializableTuple& tpl)
//       : view1_ViewMeta(std::get<0>(tpl), std::get<1>(tpl)) {}
//
//   view1_ViewMeta(bool reapply_views, const std::vector<int64_t>& size)
//       : ViewMeta(/*has_symbolic_inputs=*/false),
//         reapply_views(reapply_views),
//         size(size) {}
//
//   Tensor forward(const Tensor& base) override {
//     return base.view1(...);
//   ViewMeta(
//     [<captures>](const Tensor& base, int64_t mutated_view_idx) {
//       return base.view1(...);
//     },
//     [<captures>](const at::Tensor& base, const at::Tensor& mutated_view,
//         int64_t mutated_view_idx) -> at::Tensor {
//       return at::functionalization::impl::view1_inverse(base, mutated_view,
//           ...);
//     }
//
//   Tensor reverse(const Tensor& base, const Tensor& mutated_view) override {
//     return at::functionalization::impl::view1_inverse(base, mutated_view,
//         ...);
//   }
// The forward_fn lambda describes how to replay view1 on a tensor.
//
//   SerializableTuple to_serializable_tuple() {
//     return std::make_tuple(reapply_views, size);
//   }
//
//   bool reapply_views;
//   std::vector<int64_t> size;
// };
//
// The forward function describes how to replay view1 on a tensor.
//
// The reverse function describes how, given a tensor that is already a view,
// The reverse_fn lambda describes how, given a tensor that is already a view,
// how to get the corresponding base tensor. See Note [Functionalization Pass:
// View Inverses] for details.
//
// `SerializedTuple` is a typedef that defines an `std::tuple<...>` type
// representing the `ViewMeta` instance state. Methods that take in/return such
// a type are used for supporting pickle serialization.
struct ViewMeta {
  ViewMeta(
      std::function<Tensor(const Tensor&, int64_t)> forward,
      std::function<Tensor(const Tensor&, const Tensor&, int64_t)> reverse,
      bool has_symbolic_inputs,
      bool is_multi_output = false,
      bool is_as_strided = false,
      int64_t out_idx = 0)
      : out_index(out_idx),
      : forward_fn(std::move(forward)),
        reverse_fn(std::move(reverse)),
        out_index(out_idx),
        is_multi_output(is_multi_output),
        is_as_strided(is_as_strided),
        has_symbolic_inputs(has_symbolic_inputs) {}

  virtual ~ViewMeta() = default;

  virtual Tensor forward(const Tensor& base) = 0;
  virtual Tensor reverse(const Tensor& base, const Tensor& mutated_view) = 0;

  std::function<Tensor(const Tensor&, int64_t)> forward_fn;
  std::function<Tensor(const Tensor&, const Tensor&, int64_t)> reverse_fn;
  // See Note [out_idx in ViewMeta]
  int64_t out_index;
@ -102,17 +57,10 @@ struct ViewMeta {
  // Tells us if this view operation has any symbolic inputs
  bool has_symbolic_inputs;

  // Returns a new ViewMeta with the same forward/reverse
  // Returns a copy of the current ViewMeta, if out_idx matches the current
  // out_index. Otherwise, returns a new ViewMeta with the same forward/reverse
  // functions, but a new out index.
  //
  // This method should be implemented by those `ViewMeta` that have more than
  // one output.
  virtual std::shared_ptr<ViewMeta> to_out_index(int64_t out_index) {
    TORCH_CHECK_NOT_IMPLEMENTED(
        false,
        "ViewMeta::to_out_index not implemented. ",
        "Likely because there's only one output.");
  }
  ViewMeta to_out_idx(int64_t out_idx);
};

// FunctionalStorageImpl is a subclass of StorageImpl used by the
@ -145,14 +93,14 @@ struct TORCH_API FunctionalStorageImpl : public c10::StorageImpl {
    // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members)
    const at::Tensor new_val;
    // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members)
    const std::vector<std::shared_ptr<ViewMeta>> view_metas;
    const std::vector<ViewMeta> view_metas;
  };

  explicit FunctionalStorageImpl(const Tensor& value);

  void add_update(
      const Tensor& updated_val,
      const std::vector<std::shared_ptr<ViewMeta>>& view_metas);
      const std::vector<ViewMeta>& view_metas);
  bool apply_updates();
  const Tensor& base() {
    return base_;
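With the refactor above, a view op no longer subclasses ViewMeta; it packages its replay and inverse behavior as lambdas handed to the ViewMeta constructor. A rough sketch of such a construction, modeled on the resize_/_unsafe_view call sites further down in this diff; the narrow/slice_scatter pairing is an illustrative assumption, not code from this PR:

```cpp
// Sketch of a lambda-based ViewMeta, assuming the constructor exactly as
// declared above; compile against libtorch.
#include <ATen/ATen.h>
#include <ATen/FunctionalStorageImpl.h>

at::functionalization::ViewMeta make_narrow_view_meta(int64_t length) {
  return at::functionalization::ViewMeta(
      // forward_fn: how to replay the view op on a (possibly different) base.
      [length](const at::Tensor& base, int64_t /*mutated_view_idx*/) -> at::Tensor {
        return base.narrow(/*dim=*/0, /*start=*/0, /*length=*/length);
      },
      // reverse_fn: how to scatter a mutated view back into the base.
      [length](const at::Tensor& base,
               const at::Tensor& mutated_view,
               int64_t /*mutated_view_idx*/) -> at::Tensor {
        return base.slice_scatter(
            mutated_view, /*dim=*/0, /*start=*/0, /*end=*/length);
      },
      /*has_symbolic_inputs=*/false);
}
```

Because the state lives in the lambda captures, ViewMeta values can be copied and held by value, which is what the std::vector<ViewMeta> signatures above switch to.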
@ -129,19 +129,17 @@ void FunctionalTensorWrapper::freeze_storage() const {
|
||||
// - view_value: The output tensor that we need to wrap.
|
||||
// - base: The "base" of the view that `view_value` was generated from.
|
||||
// See Note [Functionalization: Alias Removal Part 2] for more details on the mutation replay logic.
|
||||
FunctionalTensorWrapper::FunctionalTensorWrapper(
|
||||
const Tensor& view_value,
|
||||
const FunctionalTensorWrapper* base,
|
||||
const std::shared_ptr<functionalization::ViewMeta>& meta)
|
||||
: c10::TensorImpl(
|
||||
c10::DispatchKeySet(DispatchKey::Functionalize),
|
||||
view_value.dtype(),
|
||||
base->storage().data_ptr().device()),
|
||||
value_(view_value),
|
||||
is_multi_output_view_(
|
||||
base->is_multi_output_view_ || meta->is_multi_output),
|
||||
was_storage_changed_(base->was_storage_changed_),
|
||||
is_symbolic_(base->is_symbolic_) {
|
||||
FunctionalTensorWrapper::FunctionalTensorWrapper(const Tensor& view_value, const FunctionalTensorWrapper* base, const functionalization::ViewMeta& meta)
|
||||
: c10::TensorImpl(
|
||||
c10::DispatchKeySet(DispatchKey::Functionalize),
|
||||
view_value.dtype(),
|
||||
base->storage().data_ptr().device()
|
||||
),
|
||||
value_(view_value),
|
||||
is_multi_output_view_(base->is_multi_output_view_ || meta.is_multi_output),
|
||||
was_storage_changed_(base->was_storage_changed_),
|
||||
is_symbolic_(base->is_symbolic_)
|
||||
{
|
||||
TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(value_));
|
||||
TORCH_INTERNAL_ASSERT(!value_.key_set().has(c10::DispatchKey::Functionalize));
|
||||
set_constructor_metadata();
|
||||
@ -150,10 +148,11 @@ FunctionalTensorWrapper::FunctionalTensorWrapper(
|
||||
view_metas_ = base->view_metas_; // copy
|
||||
}
|
||||
view_metas_.push_back(meta);
|
||||
maybe_mark_symbolic(meta.get());
|
||||
maybe_mark_symbolic(meta);
|
||||
storage_ = base->storage_; // alias this tensor's storage with the base tensor's
|
||||
}
|
||||
|
||||
|
||||
functionalization::FunctionalStorageImpl* FunctionalTensorWrapper::functional_storage_impl() const {
|
||||
return static_cast<functionalization::FunctionalStorageImpl*>(storage_.unsafeGetStorageImpl());
|
||||
}
|
||||
@ -177,18 +176,18 @@ bool FunctionalTensorWrapper::is_up_to_date() const {
|
||||
}
|
||||
|
||||
// See Note [Functionalization Pass - Inplace View Ops]
|
||||
void FunctionalTensorWrapper::mutate_view_meta(const std::shared_ptr<at::functionalization::ViewMeta>& meta) {
|
||||
void FunctionalTensorWrapper::mutate_view_meta(const at::functionalization::ViewMeta& meta) {
|
||||
view_metas_.push_back(meta);
|
||||
// Manually track the fact that this tensor received a metadata mutation!
|
||||
has_metadata_mutation_ = true;
|
||||
// Mark this tensor as being symbolic if there are any symbolic inputs used by the view operation.
|
||||
maybe_mark_symbolic(meta.get());
|
||||
maybe_mark_symbolic(meta);
|
||||
// Note [Functionalization Pass - Inplace View Ops]
|
||||
// So, these ops are special - they're mutation AND view ops. They get special codegen.
|
||||
// An example is transpose_, e.g. `a.transpose_()`
|
||||
// Calling transpose_() should ensure that a gets an alias, and append the new ViewMeta to a's current list of ViewMetas.
|
||||
at::AutoDispatchSkipFunctionalize guard;
|
||||
value_ = meta->forward(value_);
|
||||
value_ = meta.forward_fn(value_, meta.out_index);
|
||||
TORCH_INTERNAL_ASSERT(!value_.key_set().has(c10::DispatchKey::Functionalize));
|
||||
}
|
||||
|
||||
@ -369,8 +368,15 @@ void FunctionalTensorWrapper::sync_() {
|
||||
regenerate_from_base();
|
||||
}
|
||||
|
||||
const std::vector<std::shared_ptr<functionalization::ViewMeta>>& FunctionalTensorWrapper::view_metas() const {
|
||||
return view_metas_;
|
||||
Tensor FunctionalTensorWrapper::apply_view_metas(const Tensor& base) {
|
||||
auto t = base;
|
||||
|
||||
// Reapply views to get the viewed tensor from the base in alias_
|
||||
for (auto& view_meta: view_metas_) {
|
||||
t = view_meta.forward_fn(t, view_meta.out_index);
|
||||
}
|
||||
|
||||
return t;
|
||||
}
|
||||
|
||||
void FunctionalTensorWrapper::regenerate_from_base() {
|
||||
@ -379,7 +385,7 @@ void FunctionalTensorWrapper::regenerate_from_base() {
|
||||
auto t = storage_impl->base();
|
||||
|
||||
TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(t));
|
||||
t = at::functionalization::impl::apply_view_meta_sequence(t, view_metas_);
|
||||
t = apply_view_metas(t);
|
||||
TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(t));
|
||||
|
||||
replace_(t, /*from_lazy_regenerate=*/true);
|
||||
@ -721,11 +727,11 @@ bool isFunctionalTensor(const std::optional<Tensor>& t) {
|
||||
}
|
||||
|
||||
bool isFunctionalTensor(const c10::List<::std::optional<Tensor>>& t_list) {
|
||||
if (t_list.empty()) { return false; }
|
||||
if (t_list.empty()) return false;
|
||||
auto functional_count = 0;
|
||||
for (const auto i : c10::irange(t_list.size())) {
|
||||
auto const & e= t_list[i];
|
||||
if (!e.has_value() || !e->defined()) { continue; }
|
||||
if (!e.has_value() || !e->defined()) continue;
|
||||
if (isFunctionalTensor(e)) {
|
||||
++functional_count;
|
||||
}
|
||||
@ -735,10 +741,10 @@ bool isFunctionalTensor(const c10::List<::std::optional<Tensor>>& t_list) {
|
||||
|
||||
template <typename T>
|
||||
static bool isFunctionalTensorIListRef(c10::IListRef<T> list) {
|
||||
if (list.size() == 0) { return false; }
|
||||
if (list.size() == 0) return false;
|
||||
auto functional_count = 0;
|
||||
for (const auto& tensor : list) {
|
||||
if (!tensor.defined()) { continue; }
|
||||
if (!tensor.defined()) continue;
|
||||
if (isFunctionalTensor(tensor)) {
|
||||
++functional_count;
|
||||
}
|
||||
@ -756,28 +762,20 @@ void freeze_functional_tensor(const Tensor& tensor) {
|
||||
functional_base_impl->freeze_storage();
|
||||
}
|
||||
|
||||
Tensor create_functional_tensor_with_view_meta(
|
||||
const at::Tensor& view_to_wrap,
|
||||
const at::Tensor& base,
|
||||
const std::shared_ptr<functionalization::ViewMeta>& meta,
|
||||
int64_t out_idx) {
|
||||
Tensor create_functional_tensor_with_view_meta(const at::Tensor& view_to_wrap, const at::Tensor& base, functionalization::ViewMeta meta, int64_t out_idx) {
|
||||
TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(view_to_wrap));
|
||||
TORCH_INTERNAL_ASSERT(at::functionalization::impl::isFunctionalTensor(base));
|
||||
auto functional_base_impl = at::functionalization::impl::unsafeGetFunctionalWrapper(base);
|
||||
auto meta_ = meta;
|
||||
if (out_idx != 0) {
|
||||
// Note [out_idx in ViewMeta]
|
||||
// When a view op outputs multiple tensors, each output needs its own separate ViewMeta.
|
||||
// Each ViewMeta also tracks the index of the particular output tensor, which is needed in the reverse function.
|
||||
meta_ = meta->to_out_index(out_idx);
|
||||
meta = meta.to_out_idx(out_idx);
|
||||
}
|
||||
return at::detail::make_tensor<FunctionalTensorWrapper>(view_to_wrap, functional_base_impl, meta_);
|
||||
return at::detail::make_tensor<FunctionalTensorWrapper>(view_to_wrap, functional_base_impl, meta);
|
||||
}
|
||||
|
||||
std::vector<Tensor> create_functional_tensor_with_view_meta(
|
||||
ITensorListRef view_to_wrap,
|
||||
const at::Tensor& base,
|
||||
const std::shared_ptr<functionalization::ViewMeta>& meta) {
|
||||
std::vector<Tensor> create_functional_tensor_with_view_meta(ITensorListRef view_to_wrap, const at::Tensor& base, const functionalization::ViewMeta& meta) {
|
||||
std::vector<Tensor> outputs(view_to_wrap.size());
|
||||
int64_t i = 0;
|
||||
for (const auto& tensor : view_to_wrap) {
|
||||
@ -787,22 +785,12 @@ std::vector<Tensor> create_functional_tensor_with_view_meta(
|
||||
return outputs;
|
||||
}
|
||||
|
||||
void mutate_view_meta(const at::Tensor& self, const std::shared_ptr<functionalization::ViewMeta>& meta) {
|
||||
void mutate_view_meta(const at::Tensor& self, const functionalization::ViewMeta& meta) {
|
||||
TORCH_INTERNAL_ASSERT(at::functionalization::impl::isFunctionalTensor(self));
|
||||
auto self_impl = at::functionalization::impl::unsafeGetFunctionalWrapper(self);
|
||||
self_impl->mutate_view_meta(meta);
|
||||
}
|
||||
|
||||
Tensor apply_view_meta_sequence(
|
||||
const Tensor& base,
|
||||
const std::vector<std::shared_ptr<functionalization::ViewMeta>>& sequence) {
|
||||
Tensor r = base;
|
||||
for (auto& vm : sequence) {
|
||||
r = vm->forward(r);
|
||||
}
|
||||
return r;
|
||||
}
|
||||
|
||||
// Note [Propagating strides in the functionalization pass]
|
||||
// In order to properly compute stride information, the functionalization pass
|
||||
// calls each {view} reference implementations with meta tensors.
|
||||
@ -896,7 +884,7 @@ void functionalize_op_helper(const c10::OperatorHandle& op, torch::jit::Stack* s
|
||||
const auto& ivalue = returns[idx];
|
||||
if (ivalue.isTensor()) {
|
||||
const auto& t = ivalue.toTensor();
|
||||
if (!t.defined()) { continue; }
|
||||
if (!t.defined()) continue;
|
||||
at::functionalization::impl::sync(t);
|
||||
auto t_new = c10::IValue(at::functionalization::impl::from_functional_tensor(t));
|
||||
(*stack)[returns_begin + idx] = t_new;
|
||||
|
||||
@ -56,7 +56,7 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl {
|
||||
explicit FunctionalTensorWrapper(
|
||||
const Tensor& view_value,
|
||||
const FunctionalTensorWrapper* base,
|
||||
const std::shared_ptr<functionalization::ViewMeta>& meta);
|
||||
const functionalization::ViewMeta& meta);
|
||||
|
||||
// Get the underlying, actual tensor, that doesn't know anything about
|
||||
// functionalization.
|
||||
@ -99,17 +99,17 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl {
|
||||
->are_all_mutations_under_no_grad_or_inference_mode();
|
||||
}
|
||||
|
||||
void maybe_mark_symbolic(functionalization::ViewMeta* meta) {
|
||||
is_symbolic_ = is_symbolic_ | meta->has_symbolic_inputs;
|
||||
void maybe_mark_symbolic(const functionalization::ViewMeta& meta) {
|
||||
is_symbolic_ = is_symbolic_ | meta.has_symbolic_inputs;
|
||||
}
|
||||
|
||||
bool is_symbolic() const {
|
||||
return is_symbolic_;
|
||||
}
|
||||
|
||||
// Retrieves the ViewMeta sequence of this tensor.
|
||||
const std::vector<std::shared_ptr<functionalization::ViewMeta>>& view_metas()
|
||||
const;
|
||||
// Runs the forward_fn of every ViewMeta collected in the current instance
|
||||
// to some other base.
|
||||
Tensor apply_view_metas(const Tensor& base);
|
||||
|
||||
// Sync's the underlying tensor with its alias, if it's out of date. This
|
||||
// involves two steps: 1) Apply any pending updates/mutations to the alias 2)
|
||||
@ -146,8 +146,7 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl {
|
||||
// from the base tensor. This method is used by inplace-view ops like
|
||||
// transpose_. It appends a ViewMeta to the existing stack, and refreshes the
|
||||
// tensor by replaying the views off of the alias.
|
||||
void mutate_view_meta(
|
||||
const std::shared_ptr<at::functionalization::ViewMeta>& meta);
|
||||
void mutate_view_meta(const at::functionalization::ViewMeta& meta);
|
||||
|
||||
// Custom implementation of self.set_(src)
|
||||
void set__impl(const FunctionalTensorWrapper* other);
|
||||
@ -286,7 +285,7 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl {
|
||||
bool is_symbolic_ = false;
|
||||
|
||||
size_t generation_ = 0;
|
||||
std::vector<std::shared_ptr<at::functionalization::ViewMeta>> view_metas_;
|
||||
std::vector<at::functionalization::ViewMeta> view_metas_;
|
||||
|
||||
protected:
|
||||
static void copy_tensor_metadata(
|
||||
@ -378,20 +377,16 @@ TORCH_API void propagate_xla_data_direct(
|
||||
Tensor create_functional_tensor_with_view_meta(
|
||||
const Tensor& view_to_wrap,
|
||||
const Tensor& base,
|
||||
const std::shared_ptr<functionalization::ViewMeta>& meta,
|
||||
functionalization::ViewMeta meta,
|
||||
int64_t out_idx = 0);
|
||||
std::vector<Tensor> create_functional_tensor_with_view_meta(
|
||||
ITensorListRef view_to_wrap,
|
||||
const Tensor& base,
|
||||
const std::shared_ptr<functionalization::ViewMeta>& meta);
|
||||
const functionalization::ViewMeta& meta);
|
||||
|
||||
void mutate_view_meta(
|
||||
const Tensor& self,
|
||||
const std::shared_ptr<functionalization::ViewMeta>& meta);
|
||||
|
||||
TORCH_API Tensor apply_view_meta_sequence(
|
||||
const Tensor& base,
|
||||
const std::vector<std::shared_ptr<functionalization::ViewMeta>>& sequence);
|
||||
const functionalization::ViewMeta& meta);
|
||||
|
||||
void set_sizes_strides_offset(const Tensor& out, const Tensor& meta_out);
|
||||
void set_sizes_strides_offset(
|
||||
|
||||
@ -1,5 +1,3 @@
|
||||
#include <ATen/FunctionalizeFallbackKernel.h>
|
||||
|
||||
#include <ATen/core/dispatch/Dispatcher.h>
|
||||
#include <ATen/core/LegacyTypeDispatch.h>
|
||||
#include <ATen/EmptyTensor.h>
|
||||
@ -9,6 +7,7 @@
|
||||
#include <torch/library.h>
|
||||
#include <c10/util/irange.h>
|
||||
#include <c10/util/strides.h>
|
||||
#include <ATen/EmptyTensor.h>
|
||||
|
||||
#ifndef AT_PER_OPERATOR_HEADERS
|
||||
#include <ATen/ATen.h>
|
||||
@ -29,31 +28,6 @@
|
||||
#include <utility>
|
||||
#endif
|
||||
|
||||
namespace at::functionalization {
|
||||
|
||||
Tensor resize__ViewMeta::forward(const Tensor& base) {
|
||||
if (reapply_views) {
|
||||
return base.as_strided(size, c10::contiguous_strides(size));
|
||||
} else {
|
||||
return at::as_strided_copy(base, size, c10::contiguous_strides(size));
|
||||
}
|
||||
}
|
||||
|
||||
Tensor resize__ViewMeta::reverse(const Tensor& base, const Tensor& mutated_view) {
|
||||
return base.as_strided_scatter(
|
||||
mutated_view, size, c10::contiguous_strides(size));
|
||||
}
|
||||
|
||||
Tensor _unsafe_view_ViewMeta::forward(const Tensor& base) {
|
||||
return at::_unsafe_view_symint(base, size);
|
||||
}
|
||||
|
||||
Tensor _unsafe_view_ViewMeta::reverse(const Tensor& base, const Tensor& mutated_view) {
|
||||
return at::_unsafe_view_symint(mutated_view, base.sym_sizes());
|
||||
}
|
||||
|
||||
} // namespace at::functionalization
|
||||
|
||||
namespace {
|
||||
void functionalizeFallback(const c10::OperatorHandle& op, c10::DispatchKeySet dispatchKeySet [[maybe_unused]], torch::jit::Stack* stack) {
|
||||
const auto& schema = op.schema();
|
||||
@ -132,9 +106,7 @@ namespace {
|
||||
const auto& ivalue = returns[idx];
|
||||
if (ivalue.isTensor() && should_wrap_outputs) {
|
||||
const auto& t = ivalue.toTensor();
|
||||
if (!t.defined()) {
|
||||
continue;
|
||||
}
|
||||
if (!t.defined()) continue;
|
||||
auto t_new = c10::IValue(at::functionalization::impl::to_functional_tensor(t));
|
||||
(*stack)[returns_begin + idx] = t_new;
|
||||
} else if (ivalue.isTensorList() && should_wrap_outputs) {
|
||||
@ -197,8 +169,19 @@ static const at::Tensor & resize__functionalization(c10::DispatchKeySet dispatch
|
||||
// The output of resizing is equivalent to taking a slice of a larger tensor.
|
||||
// We have to emulate this "slicing" with an as_strided call.
|
||||
auto reapply_views = at::functionalization::impl::getFunctionalizationReapplyViewsTLS();
|
||||
auto view_meta = std::make_shared<at::functionalization::resize__ViewMeta>(
|
||||
reapply_views, size.vec());
|
||||
at::functionalization::ViewMeta view_meta = at::functionalization::ViewMeta(
|
||||
[reapply_views = reapply_views, size = size.vec()](const at::Tensor & base, int64_t mutated_view_idx [[maybe_unused]]) -> at::Tensor {
|
||||
if (reapply_views) {
|
||||
return base.as_strided(size, c10::contiguous_strides(size));
|
||||
} else {
|
||||
return at::as_strided_copy(base, size, c10::contiguous_strides(size));
|
||||
}
|
||||
},
|
||||
[size = size.vec()](const at::Tensor & base, const at::Tensor & mutated_view, int64_t mutated_view_idx [[maybe_unused]]) -> at::Tensor {
|
||||
return base.as_strided_scatter(mutated_view, size, c10::contiguous_strides(size));
|
||||
},
|
||||
/*has_symbolic_inputs=*/false
|
||||
);
|
||||
at::functionalization::impl::mutate_view_meta(self, view_meta);
|
||||
return self;
|
||||
}
|
||||
@ -317,11 +300,17 @@ static at::Tensor _unsafe_view_functionalize(const at::Tensor & self, at::SymInt
|
||||
tmp_output = at::_unsafe_view_symint(self_, size);
|
||||
}
|
||||
|
||||
bool has_symbolic_inputs = std::any_of(
|
||||
size.begin(), size.end(), [=](auto& s) { return s.is_symbolic(); });
|
||||
auto view_meta =
|
||||
std::make_shared<at::functionalization::_unsafe_view_ViewMeta>(
|
||||
has_symbolic_inputs, size.vec());
|
||||
bool has_symbolic_inputs = std::any_of(size.begin(), size.end(), [=](auto& s) { return s.is_symbolic(); });
|
||||
|
||||
at::functionalization::ViewMeta view_meta = at::functionalization::ViewMeta(
|
||||
[size = size.vec()](const at::Tensor & base, int64_t mutated_view_idx [[maybe_unused]]) -> at::Tensor {
|
||||
return at::_unsafe_view_symint(base, size);
|
||||
},
|
||||
[size = size.vec()](const at::Tensor & base, const at::Tensor & mutated_view, int64_t mutated_view_idx [[maybe_unused]]) -> at::Tensor {
|
||||
return at::_unsafe_view_symint(mutated_view, base.sym_sizes());
|
||||
},
|
||||
/*has_symbolic_inputs=*/has_symbolic_inputs
|
||||
);
|
||||
|
||||
auto out = at::functionalization::impl::create_functional_tensor_with_view_meta(tmp_output, self, std::move(view_meta));
|
||||
// See Note [Propagating strides in the functionalization pass]
|
||||
|
||||
@ -1,58 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#include <ATen/FunctionalStorageImpl.h>
|
||||
|
||||
namespace at::functionalization {
|
||||
|
||||
// `ViewMeta` implementation for `resize_` operation.
|
||||
struct TORCH_API resize__ViewMeta : public ViewMeta {
|
||||
FUNCTIONALIZATION_VIEWMETA_NAME(resize__ViewMeta)
|
||||
FUNCTIONALIZATION_VIEWMETA_SERIALIZABLE_TUPLE(
|
||||
bool /* reapply_views */,
|
||||
const std::vector<int64_t>&);
|
||||
|
||||
resize__ViewMeta(const SerializableTuple& tpl)
|
||||
: resize__ViewMeta(std::get<0>(tpl), std::get<1>(tpl)) {}
|
||||
|
||||
resize__ViewMeta(bool reapply_views, const std::vector<int64_t>& size)
|
||||
: ViewMeta(/*has_symbolic_inputs=*/false),
|
||||
reapply_views(reapply_views),
|
||||
size(size) {}
|
||||
|
||||
Tensor forward(const Tensor& base) override;
|
||||
Tensor reverse(const Tensor& base, const Tensor& mutated_view) override;
|
||||
|
||||
SerializableTuple to_serializable_tuple() {
|
||||
return std::make_tuple(reapply_views, size);
|
||||
}
|
||||
|
||||
bool reapply_views;
|
||||
std::vector<int64_t> size;
|
||||
};
|
||||
|
||||
// `ViewMeta` implementation for `_unsafe_view` operation.
|
||||
struct TORCH_API _unsafe_view_ViewMeta : public ViewMeta {
|
||||
FUNCTIONALIZATION_VIEWMETA_NAME(_unsafe_view_ViewMeta)
|
||||
FUNCTIONALIZATION_VIEWMETA_SERIALIZABLE_TUPLE(
|
||||
bool /* has_symbolic_inputs */,
|
||||
const std::vector<c10::SymInt>&);
|
||||
|
||||
_unsafe_view_ViewMeta(const SerializableTuple& tpl)
|
||||
: _unsafe_view_ViewMeta(std::get<0>(tpl), std::get<1>(tpl)) {}
|
||||
|
||||
_unsafe_view_ViewMeta(
|
||||
bool has_symbolic_inputs,
|
||||
const std::vector<c10::SymInt>& size)
|
||||
: ViewMeta(has_symbolic_inputs), size(size) {}
|
||||
|
||||
Tensor forward(const Tensor& base) override;
|
||||
Tensor reverse(const Tensor& base, const Tensor& mutated_view) override;
|
||||
|
||||
SerializableTuple to_serializable_tuple() {
|
||||
return std::make_tuple(has_symbolic_inputs, size);
|
||||
}
|
||||
|
||||
std::vector<c10::SymInt> size;
|
||||
};
|
||||
|
||||
} // namespace at::functionalization
|
||||
@ -120,7 +120,7 @@ at::Tensor _cslt_compress(const Tensor& sparse_input) {
|
||||
// buffer (in bytes)
|
||||
size_t orig_m = sparse_input.size(0);
|
||||
size_t div = orig_m * sparse_input.itemsize();
|
||||
size_t new_n = (compressed_size + div - 1) / div; // ceil(s,d) = (s+d-1)/d
|
||||
size_t new_n = (compressed_size + div - 1) / div; // floor
|
||||
auto compressed_tensor = sparse_input.new_empty({(int64_t)orig_m, (int64_t)new_n});
|
||||
|
||||
auto& allocator = *::c10::cuda::CUDACachingAllocator::get();
|
||||
@ -155,7 +155,7 @@ std::tuple<at::Tensor, int64_t, int64_t, int64_t, int64_t> _cslt_sparse_mm_impl(
|
||||
TORCH_CUDASPARSE_CHECK(cusparseLtInit(&handle));
|
||||
handle_initialized = true;
|
||||
}
|
||||
// cuSPARSELt constructs
|
||||
// cupsarselt constructs
|
||||
cusparseLtMatmulDescriptor_t matmul;
|
||||
cusparseLtMatmulPlan_t plan;
|
||||
cusparseLtMatmulAlgSelection_t alg_sel;
|
||||
|
||||
@ -2,12 +2,22 @@
|
||||
|
||||
// ${generated_comment}
|
||||
|
||||
#include <ATen/FunctionalStorageImpl.h>
|
||||
#include <ATen/Tensor.h>
|
||||
|
||||
namespace at {
|
||||
namespace functionalization {
|
||||
|
||||
enum class InverseReturnMode {
|
||||
/// Specifies that functional inverses should always return a view.
|
||||
AlwaysView,
|
||||
/// Specifies that functional inverses should always return a non-view / copy.
|
||||
NeverView,
|
||||
/// Specifies that functional inverses should return a view unless a (copying) scatter
|
||||
/// inverse exists, in which case that will be used instead.
|
||||
/// This avoids as_strided() calls that can be difficult for subclasses to handle.
|
||||
ViewOrScatterInverse,
|
||||
};
|
||||
|
||||
struct FunctionalInverses {
|
||||
|
||||
${view_inverse_declarations}
|
||||
|
||||
@ -4,7 +4,7 @@
|
||||
#include <ATen/core/LegacyTypeDispatch.h>
|
||||
#include <ATen/EmptyTensor.h>
|
||||
#include <ATen/FunctionalTensorWrapper.h>
|
||||
#include <ATen/ViewMetaClasses.h>
|
||||
#include <ATen/FunctionalInverses.h>
|
||||
#include <ATen/MemoryOverlap.h>
|
||||
#include <torch/library.h>
|
||||
|
||||
|
||||
@ -1,19 +0,0 @@
|
||||
// ${generated_comment}
|
||||
|
||||
#include <ATen/FunctionalInverses.h>
|
||||
#include <ATen/ViewMetaClasses.h>
|
||||
|
||||
#ifndef AT_PER_OPERATOR_HEADERS
|
||||
#include <ATen/Operators.h>
|
||||
#include <ATen/NativeFunctions.h>
|
||||
#else
|
||||
${op_headers}
|
||||
#endif
|
||||
|
||||
namespace at {
|
||||
namespace functionalization {
|
||||
|
||||
${view_meta_implementations}
|
||||
|
||||
} // namespace functionalization
|
||||
} // namespace at
|
||||
@ -1,12 +0,0 @@
|
||||
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
|
||||
// ${generated_comment}
|
||||
|
||||
#include <ATen/FunctionalStorageImpl.h>
|
||||
|
||||
namespace at {
|
||||
namespace functionalization {
|
||||
|
||||
${view_meta_declarations}
|
||||
|
||||
} // namespace functionalization
|
||||
} // namespace at
|
||||
@ -1,11 +0,0 @@
|
||||
#include <ATen/ViewMetaClasses.h>
|
||||
#include <torch/csrc/functionalization/Module.h>
|
||||
|
||||
namespace torch::functionalization {
|
||||
|
||||
void initGenerated(PyObject* module) {
|
||||
auto functionalization = py::handle(module).cast<py::module>();
|
||||
$view_meta_bindings
|
||||
}
|
||||
|
||||
} // namespace torch::functionalization
|
||||
@ -156,7 +156,7 @@ ROOT = "//" if IS_OSS else "//xplat/caffe2"
|
||||
# for targets in subfolders
|
||||
ROOT_PATH = "//" if IS_OSS else "//xplat/caffe2/"
|
||||
|
||||
C10 = "//c10:c10" if IS_OSS else "//xplat/caffe2/c10:c10"
|
||||
C10 = "//c10:c10" if IS_OSS else ("//xplat/caffe2/c10:c10_ovrsource" if is_arvr_mode() else "//xplat/caffe2/c10:c10")
|
||||
|
||||
# a dictionary maps third party library name to fbsource and oss target
|
||||
THIRD_PARTY_LIBS = {
|
||||
@ -391,8 +391,6 @@ def get_aten_generated_files(enabled_backends):
|
||||
"CompositeExplicitAutogradFunctions_inl.h",
|
||||
"CompositeExplicitAutogradNonFunctionalFunctions.h",
|
||||
"CompositeExplicitAutogradNonFunctionalFunctions_inl.h",
|
||||
"ViewMetaClasses.h",
|
||||
"ViewMetaClasses.cpp",
|
||||
"VmapGeneratedPlumbing.h",
|
||||
"core/ATenOpList.cpp",
|
||||
"core/TensorBody.h",
|
||||
@ -950,6 +948,7 @@ def define_buck_targets(
|
||||
[
|
||||
("torch/csrc/api/include", "torch/**/*.h"),
|
||||
("", "torch/csrc/**/*.h"),
|
||||
("", "torch/csrc/**/*.hpp"),
|
||||
("", "torch/nativert/**/*.h"),
|
||||
("", "torch/headeronly/**/*.h"),
|
||||
("", "torch/script.h"),
|
||||
@ -1194,7 +1193,6 @@ def define_buck_targets(
|
||||
"NativeMetaFunctions.h": ":gen_aten[NativeMetaFunctions.h]",
|
||||
"Operators.h": ":gen_aten[Operators.h]",
|
||||
"RedispatchFunctions.h": ":gen_aten[RedispatchFunctions.h]",
|
||||
"ViewMetaClasses.h": ":gen_aten[ViewMetaClasses.h]",
|
||||
"core/TensorBody.h": ":gen_aten[core/TensorBody.h]",
|
||||
"core/aten_interned_strings.h": ":gen_aten[core/aten_interned_strings.h]",
|
||||
"core/enum_tag.h": ":gen_aten[core/enum_tag.h]",
|
||||
@ -2050,6 +2048,7 @@ def define_buck_targets(
|
||||
("", "caffe2/utils/*.h"),
|
||||
("", "caffe2/core/*.h"),
|
||||
("", "torch/csrc/*.h"),
|
||||
("", "torch/csrc/*.hpp"),
|
||||
("", "torch/csrc/api/include/torch/*.h"),
|
||||
("", "torch/csrc/autograd/*.h"),
|
||||
("", "torch/csrc/autograd/*/*.h"),
|
||||
|
||||
@ -118,9 +118,6 @@ def define_targets(rules):
|
||||
":LazyNonNativeIr.h",
|
||||
":RegisterDispatchDefinitions.ini",
|
||||
":RegisterDispatchKey.cpp",
|
||||
":ViewMetaClassesPythonBinding.cpp",
|
||||
":ViewMetaClasses.cpp",
|
||||
":ViewMetaClasses.h",
|
||||
":native_functions.yaml",
|
||||
":shape_inference.h",
|
||||
":tags.yaml",
|
||||
@ -173,7 +170,6 @@ GENERATED_H = [
|
||||
"FunctionalInverses.h",
|
||||
"RedispatchFunctions.h",
|
||||
"RegistrationDeclarations.h",
|
||||
"ViewMetaClasses.h",
|
||||
"VmapGeneratedPlumbing.h",
|
||||
]
|
||||
|
||||
@ -250,7 +246,6 @@ GENERATED_CPP = [
|
||||
"RegisterFunctionalization_1.cpp",
|
||||
"RegisterFunctionalization_2.cpp",
|
||||
"RegisterFunctionalization_3.cpp",
|
||||
"ViewMetaClasses.cpp",
|
||||
]
|
||||
|
||||
GENERATED_CPP_CORE = [
|
||||
@ -312,7 +307,6 @@ _GENERATED_AUTOGRAD_PYTHON_CPP = [
|
||||
"torch/csrc/autograd/generated/python_torch_functions_1.cpp",
|
||||
"torch/csrc/autograd/generated/python_torch_functions_2.cpp",
|
||||
"torch/csrc/autograd/generated/python_variable_methods.cpp",
|
||||
"torch/csrc/functionalization/generated/ViewMetaClassesPythonBinding.cpp"
|
||||
]
|
||||
|
||||
GENERATED_AUTOGRAD_PYTHON = _GENERATED_AUTOGRAD_PYTHON_HEADERS + _GENERATED_AUTOGRAD_PYTHON_CPP
|
||||
|
||||
@ -1010,7 +1010,6 @@ libtorch_python_core_sources = [
|
||||
"torch/csrc/utils/disable_torch_function.cpp",
|
||||
"torch/csrc/utils/verbose.cpp",
|
||||
"torch/csrc/cpu/Module.cpp",
|
||||
"torch/csrc/functionalization/Module.cpp",
|
||||
"torch/csrc/instruction_counter/Module.cpp",
|
||||
"torch/nativert/python/Bindings.cpp",
|
||||
] + lazy_tensor_core_python_sources
|
||||
@ -1053,7 +1052,6 @@ def glob_libtorch_python_sources(gencode_pattern = ":generate-code[{}]"):
|
||||
"torch/csrc/autograd/generated/python_torch_functions_1.cpp",
|
||||
"torch/csrc/autograd/generated/python_torch_functions_2.cpp",
|
||||
"torch/csrc/autograd/generated/python_variable_methods.cpp",
|
||||
"torch/csrc/functionalization/generated/ViewMetaClassesPythonBinding.cpp",
|
||||
]]
|
||||
|
||||
_libtorch_python_sources.extend(libtorch_python_core_sources)
|
||||
|
||||
@ -3244,7 +3244,7 @@ class C10_TensorImpl_Size_Check_Dummy_Class : private TensorImpl {
|
||||
are_equal<sizeof(autograd_meta_), 4, FieldNameEnum::autograd_meta_>();
|
||||
are_equal<sizeof(extra_meta_), 4, FieldNameEnum::extra_meta_>();
|
||||
are_equal<sizeof(version_counter_), 4, FieldNameEnum::version_counter_>();
|
||||
are_equal<sizeof(pyobj_slot_), 4, FieldNameEnum::pyobj_slot_>();
|
||||
are_equal<sizeof(pyobj_slot_), 8, FieldNameEnum::pyobj_slot_>();
|
||||
is_le<sizeof(sizes_and_strides_), 88, FieldNameEnum::sizes_and_strides_>();
|
||||
are_equal<sizeof(storage_offset_), 8, FieldNameEnum::storage_offset_>();
|
||||
are_equal<sizeof(numel_), 8, FieldNameEnum::numel_>();
|
||||
@ -3269,7 +3269,7 @@ class C10_TensorImpl_Size_Check_Dummy_Class : private TensorImpl {
|
||||
is_le<sizeof(autograd_meta_), 16, FieldNameEnum::autograd_meta_>();
|
||||
is_le<sizeof(extra_meta_), 16, FieldNameEnum::extra_meta_>();
|
||||
are_equal<sizeof(version_counter_), 8, FieldNameEnum::version_counter_>();
|
||||
are_equal<sizeof(pyobj_slot_), 8, FieldNameEnum::pyobj_slot_>();
|
||||
are_equal<sizeof(pyobj_slot_), 16, FieldNameEnum::pyobj_slot_>();
|
||||
are_equal<sizeof(sizes_and_strides_), 88, FieldNameEnum::sizes_and_strides_>();
|
||||
are_equal<sizeof(storage_offset_), 8, FieldNameEnum::storage_offset_>();
|
||||
are_equal<sizeof(numel_), 8, FieldNameEnum::numel_>();
|
||||
|
||||
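The pyobj_slot_ size expectations above change from 4 to 8 in one configuration and from 8 to 16 in the other (presumably 32- vs 64-bit pointers), because PyObjectSlot gains an atomic interpreter pointer alongside the existing PyObject pointer (see the PyObjectSlot diff below). A trivial sketch of the layout change, with hypothetical struct names:

```cpp
#include <atomic>
#include <cstdio>

struct SlotBefore {
  void* pyobj;  // one pointer-sized field
};

struct SlotAfter {
  std::atomic<void*> pyobj_interpreter;  // new atomic interpreter tag
  void* pyobj;
};

int main() {
  // On a typical 64-bit platform this prints "8 16", matching the bumped
  // are_equal<sizeof(pyobj_slot_), ...> expectations in the hunks above.
  std::printf("%zu %zu\n", sizeof(SlotBefore), sizeof(SlotAfter));
}
```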
@ -13,10 +13,11 @@ struct C10_API PyInterpreterHooksInterface {
|
||||
|
||||
// Get the PyInterpreter instance
|
||||
// Stub implementation throws error when Python is not available
|
||||
// We return nullptr rather than throwing an error since there are bits of c10
|
||||
// that expect an empty PyObjectSlot when python is not available.
|
||||
virtual PyInterpreter* getPyInterpreter() const {
|
||||
return nullptr;
|
||||
TORCH_CHECK(
|
||||
false,
|
||||
"PyTorch was compiled without Python support. "
|
||||
"Cannot access Python interpreter from C++.");
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@ -2,7 +2,7 @@
|
||||
|
||||
namespace c10::impl {
|
||||
|
||||
PyObjectSlot::PyObjectSlot() : pyobj_(nullptr) {}
|
||||
PyObjectSlot::PyObjectSlot() : pyobj_interpreter_(nullptr), pyobj_(nullptr) {}
|
||||
|
||||
PyObjectSlot::~PyObjectSlot() {
|
||||
maybe_destroy_pyobj();
|
||||
@ -10,9 +10,9 @@ PyObjectSlot::~PyObjectSlot() {
|
||||
|
||||
void PyObjectSlot::maybe_destroy_pyobj() {
|
||||
if (owns_pyobj()) {
|
||||
TORCH_INTERNAL_ASSERT(getGlobalPyInterpreter() != nullptr);
|
||||
TORCH_INTERNAL_ASSERT(pyobj_interpreter_ != nullptr);
|
||||
TORCH_INTERNAL_ASSERT(pyobj_ != nullptr);
|
||||
(*getGlobalPyInterpreter())
|
||||
(*pyobj_interpreter_.load(std::memory_order_acquire))
|
||||
->decref(_unchecked_untagged_pyobj(), /*has_pyobj_slot*/ true);
|
||||
// NB: this destructor can only be entered when there are no
|
||||
// references to this C++ object (obviously), NOR any references
|
||||
@ -25,7 +25,7 @@ void PyObjectSlot::maybe_destroy_pyobj() {
|
||||
}
|
||||
|
||||
PyInterpreter* PyObjectSlot::pyobj_interpreter() {
|
||||
return getGlobalPyInterpreter();
|
||||
return pyobj_interpreter_.load(std::memory_order_acquire);
|
||||
}
|
||||
|
||||
PyObject* PyObjectSlot::_unchecked_untagged_pyobj() const {
|
||||
@ -35,7 +35,7 @@ PyObject* PyObjectSlot::_unchecked_untagged_pyobj() const {
|
||||
}
|
||||
|
||||
PyInterpreter& PyObjectSlot::load_pyobj_interpreter() const {
|
||||
auto interpreter = getGlobalPyInterpreter();
|
||||
auto interpreter = pyobj_interpreter_.load(std::memory_order_acquire);
|
||||
if (interpreter) {
|
||||
return *interpreter;
|
||||
}
|
||||
|
||||
@ -6,17 +6,10 @@
|
||||
#include <c10/util/python_stub.h>
|
||||
#include <optional>
|
||||
|
||||
#include <atomic>
|
||||
|
||||
namespace c10::impl {
|
||||
|
||||
// Function pointer type for getting the global interpreter
|
||||
using GetPyInterpreterFn = PyInterpreter* (*)();
|
||||
|
||||
// Global function pointer (set by csrc initialization)
|
||||
C10_API extern GetPyInterpreterFn g_get_pyinterpreter_fn;
|
||||
|
||||
// Helper function to get the global interpreter
|
||||
C10_API PyInterpreter* getGlobalPyInterpreter();
|
||||
|
||||
struct C10_API PyObjectSlot {
|
||||
public:
|
||||
PyObjectSlot();
|
||||
@ -33,6 +26,8 @@ struct C10_API PyObjectSlot {
|
||||
// NB: THIS FUNCTION CAN RAISE AN EXCEPTION. Make sure to clean up after
|
||||
// PyObject if necessary!
|
||||
void init_pyobj(PyObject* pyobj) {
|
||||
pyobj_interpreter_.store(
|
||||
getGlobalPyInterpreter(), std::memory_order_relaxed);
|
||||
pyobj_ = pyobj;
|
||||
}
|
||||
|
||||
@ -60,15 +55,18 @@ struct C10_API PyObjectSlot {
|
||||
|
||||
// @todo alban: I'm not too sure what's going on here, we can probably delete
|
||||
// it but it's worthwhile making sure
|
||||
std::optional<PyObject*> check_pyobj() const {
|
||||
impl::PyInterpreter* interpreter = getGlobalPyInterpreter();
|
||||
if (interpreter == nullptr || pyobj_ == nullptr) {
|
||||
std::optional<PyObject*> check_pyobj(bool ignore_hermetic_tls = false) const {
|
||||
impl::PyInterpreter* interpreter =
|
||||
pyobj_interpreter_.load(std::memory_order_acquire);
|
||||
if (interpreter == nullptr) {
|
||||
return std::nullopt;
|
||||
}
|
||||
if (c10::impl::HermeticPyObjectTLS::get_state()) {
|
||||
|
||||
if (!ignore_hermetic_tls && c10::impl::HermeticPyObjectTLS::get_state()) {
|
||||
return std::nullopt;
|
||||
} else {
|
||||
return _unchecked_untagged_pyobj();
|
||||
}
|
||||
return _unchecked_untagged_pyobj();
|
||||
}
|
||||
|
||||
PyInterpreter& load_pyobj_interpreter() const;
|
||||
@ -78,6 +76,30 @@ struct C10_API PyObjectSlot {
|
||||
void set_owns_pyobj(bool b);
|
||||
|
||||
private:
|
||||
// This field contains the interpreter tag for this object. See
|
||||
// Note [Python interpreter tag] for general context
|
||||
//
|
||||
// Note [Memory ordering on Python interpreter tag]
|
||||
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
// What memory_order do we need when accessing this atomic? We don't
|
||||
// need a single total modification order (as provided by
|
||||
// memory_order_seq_cst) as pyobj_interpreter_ is monotonic: it can only
|
||||
// transition from -1 to some positive integer and never changes afterwards.
|
||||
// Because there is only one modification, it trivially already has a total
|
||||
// modification order (e.g., we don't need fences or locked instructions on
|
||||
// x86)
|
||||
//
|
||||
// In fact, one could make a reasonable argument that relaxed reads are OK,
|
||||
// due to the presence of external locking (GIL) to ensure that interactions
|
||||
// with other data structures are still correctly synchronized, so that
|
||||
// we fall in the "Single-Location Data Structures" case as described in
|
||||
// http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p2055r0.pdf
|
||||
// However, on x86, it doesn't matter if I use acquire or relaxed on the load
|
||||
// as I get the same assembly in both cases. So I just use the more
|
||||
// conservative acquire (which will impede compiler optimizations but I don't
|
||||
// care)
|
||||
std::atomic<PyInterpreter*> pyobj_interpreter_;
|
||||
|
||||
// This field contains a reference to a PyObject representing this Tensor.
|
||||
// If pyobj is nullptr, when we transfer Tensor to Python, we allocate a new
|
||||
// PyObject for it and set this field. This field does not have to be
|
||||
|
||||
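Note [Memory ordering on Python interpreter tag] above argues that the interpreter pointer is written once (nullptr to a live interpreter) and never changes, so acquire loads, or arguably even relaxed ones given the external locking of the GIL, are sufficient. A small self-contained sketch of that publish-once pattern with std::atomic (generic C++, not the actual PyObjectSlot code):

```cpp
#include <atomic>
#include <cstdio>

struct Interpreter {
  const char* name;
};

// Write-once "tag": it only ever transitions from nullptr to a live
// interpreter, mirroring pyobj_interpreter_ in the diff above.
std::atomic<Interpreter*> g_interpreter{nullptr};

void init_interpreter(Interpreter* interp) {
  // Relaxed store, as in init_pyobj(); the note above argues this is safe
  // because the pointer changes exactly once and external locking (the GIL)
  // orders interactions with other data structures.
  g_interpreter.store(interp, std::memory_order_relaxed);
}

const char* interpreter_name_or_default() {
  // Conservative acquire load, matching load_pyobj_interpreter()/check_pyobj().
  Interpreter* interp = g_interpreter.load(std::memory_order_acquire);
  return interp != nullptr ? interp->name : "<no python>";
}

int main() {
  std::printf("%s\n", interpreter_name_or_default());  // <no python>
  static Interpreter cpython{"cpython"};
  init_interpreter(&cpython);
  std::printf("%s\n", interpreter_name_or_default());  // cpython
}
```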
@ -18,9 +18,9 @@ cuda_supported_platforms = [
|
||||
|
||||
def define_c10_ovrsource(name, is_mobile):
|
||||
if is_mobile:
|
||||
pp_flags = ["-DC10_MOBILE=1"]
|
||||
pp_flags = ["-DC10_MOBILE=1", "-DC10_USE_GLOG"]
|
||||
else:
|
||||
pp_flags = []
|
||||
pp_flags = ["-DC10_USE_GLOG"]
|
||||
|
||||
oxx_static_library(
|
||||
name = name,
|
||||
|
||||
@ -316,7 +316,6 @@ set(GENERATED_CXX_PYTHON
|
||||
"${TORCH_SRC_DIR}/csrc/autograd/generated/python_special_functions.cpp"
|
||||
"${TORCH_SRC_DIR}/csrc/autograd/generated/python_return_types.cpp"
|
||||
"${TORCH_SRC_DIR}/csrc/autograd/generated/python_enum_tag.cpp"
|
||||
"${TORCH_SRC_DIR}/csrc/functionalization/generated/ViewMetaClassesPythonBinding.cpp"
|
||||
)
|
||||
|
||||
set(GENERATED_H_PYTHON
|
||||
@ -380,9 +379,6 @@ add_custom_command(
|
||||
"${TORCH_ROOT}/aten/src/ATen/templates/LazyIr.h"
|
||||
"${TORCH_ROOT}/aten/src/ATen/templates/LazyNonNativeIr.h"
|
||||
"${TORCH_ROOT}/aten/src/ATen/templates/RegisterDispatchKey.cpp"
|
||||
"${TORCH_ROOT}/aten/src/ATen/templates/ViewMetaClasses.h"
|
||||
"${TORCH_ROOT}/aten/src/ATen/templates/ViewMetaClasses.cpp"
|
||||
"${TORCH_ROOT}/aten/src/ATen/templates/ViewMetaClassesPythonBinding.cpp"
|
||||
${autograd_python}
|
||||
${autograd_yaml}
|
||||
${autograd_templates}
|
||||
@ -544,11 +540,9 @@ if(NOT INTERN_BUILD_MOBILE AND NOT BUILD_LITE_INTERPRETER)
|
||||
${TORCH_SRC_DIR}/csrc/utils/byte_order.cpp
|
||||
)
|
||||
|
||||
if(USE_DISTRIBUTED)
|
||||
append_filelist("libtorch_distributed_base_sources" TORCH_SRCS)
|
||||
if(NOT WIN32)
|
||||
append_filelist("libtorch_distributed_extra_sources" TORCH_SRCS)
|
||||
endif()
|
||||
append_filelist("libtorch_distributed_base_sources" TORCH_SRCS)
|
||||
if(NOT WIN32)
|
||||
append_filelist("libtorch_distributed_extra_sources" TORCH_SRCS)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
@ -579,32 +573,30 @@ if(USE_CUDA)
|
||||
list(APPEND Caffe2_GPU_SRCS
|
||||
${TORCH_SRC_DIR}/csrc/cuda/nccl.cpp)
|
||||
endif()
|
||||
if(USE_DISTRIBUTED)
|
||||
append_filelist("libtorch_cuda_distributed_base_sources" Caffe2_GPU_SRCS)
|
||||
if(NOT WIN32)
|
||||
append_filelist("libtorch_cuda_distributed_extra_sources" Caffe2_GPU_SRCS)
|
||||
set_source_files_properties(
|
||||
${TORCH_SRC_DIR}/csrc/distributed/c10d/ProcessGroupNCCL.cpp
|
||||
${TORCH_SRC_DIR}/csrc/distributed/c10d/cuda/utils.cpp
|
||||
${TORCH_SRC_DIR}/csrc/distributed/c10d/intra_node_comm.cpp
|
||||
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/CudaDMAConnectivity.cpp
|
||||
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory.cu
|
||||
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryOps.cu
|
||||
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryUtils.cpp
|
||||
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/NCCLSymmetricMemory.cu
|
||||
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/cuda_mem_pool.cpp
|
||||
PROPERTIES COMPILE_FLAGS "-DPYTORCH_C10_DRIVER_API_SUPPORTED=1"
|
||||
)
|
||||
endif()
|
||||
append_filelist("libtorch_cuda_distributed_base_sources" Caffe2_GPU_SRCS)
|
||||
if(NOT WIN32)
|
||||
append_filelist("libtorch_cuda_distributed_extra_sources" Caffe2_GPU_SRCS)
|
||||
set_source_files_properties(
|
||||
${TORCH_SRC_DIR}/csrc/distributed/c10d/ProcessGroupNCCL.cpp
|
||||
${TORCH_SRC_DIR}/csrc/distributed/c10d/cuda/utils.cpp
|
||||
${TORCH_SRC_DIR}/csrc/distributed/c10d/intra_node_comm.cpp
|
||||
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/CudaDMAConnectivity.cpp
|
||||
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory.cu
|
||||
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryOps.cu
|
||||
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryUtils.cpp
|
||||
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/NCCLSymmetricMemory.cu
|
||||
${TORCH_SRC_DIR}/csrc/distributed/c10d/symm_mem/cuda_mem_pool.cpp
|
||||
PROPERTIES COMPILE_FLAGS "-DPYTORCH_C10_DRIVER_API_SUPPORTED=1"
|
||||
)
|
||||
endif()
|
||||
|
||||
set(ASYNC_MM_FILE "${TORCH_SRC_DIR}/csrc/distributed/c10d/cuda/AsyncMM.cu")
|
||||
# Disable the warning to make cutlass warp-specialized cooperative kernel build for gcc-9
|
||||
if(CMAKE_COMPILER_IS_GNUCXX)
|
||||
set_source_files_properties(${ASYNC_MM_FILE} PROPERTIES COMPILE_FLAGS "-Wno-unused-but-set-variable")
|
||||
endif()
|
||||
if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.0 AND CUDA_NVCC_FLAGS MATCHES ".*compute_90.*")
|
||||
set_source_files_properties(${ASYNC_MM_FILE} PROPERTIES COMPILE_FLAGS "-gencode arch=compute_90a,code=sm_90a")
|
||||
endif()
|
||||
set(ASYNC_MM_FILE "${TORCH_SRC_DIR}/csrc/distributed/c10d/cuda/AsyncMM.cu")
|
||||
# Disable the warning to make cutlass warp-specialized cooperative kernel build for gcc-9
|
||||
if(CMAKE_COMPILER_IS_GNUCXX)
|
||||
set_source_files_properties(${ASYNC_MM_FILE} PROPERTIES COMPILE_FLAGS "-Wno-unused-but-set-variable")
|
||||
endif()
|
||||
if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.0 AND CUDA_NVCC_FLAGS MATCHES ".*compute_90.*")
|
||||
set_source_files_properties(${ASYNC_MM_FILE} PROPERTIES COMPILE_FLAGS "-gencode arch=compute_90a,code=sm_90a")
|
||||
endif()
|
||||
set_source_files_properties(
|
||||
${TORCH_ROOT}/aten/src/ATen/cuda/detail/LazyNVRTC.cpp
|
||||
@ -637,11 +629,9 @@ if(USE_ROCM)
list(APPEND Caffe2_HIP_SRCS
${TORCH_SRC_DIR}/csrc/cuda/nccl.cpp)
endif()
if(USE_DISTRIBUTED)
append_filelist("libtorch_cuda_distributed_base_sources" Caffe2_HIP_SRCS)
if(NOT WIN32)
append_filelist("libtorch_cuda_distributed_extra_sources" Caffe2_HIP_SRCS)
endif()
append_filelist("libtorch_cuda_distributed_base_sources" Caffe2_HIP_SRCS)
if(NOT WIN32)
append_filelist("libtorch_cuda_distributed_extra_sources" Caffe2_HIP_SRCS)
endif()
# caffe2_nvrtc's stubs to driver APIs are useful for HIP.
# See NOTE [ ATen NVRTC Stub and HIP ]
@ -1362,12 +1352,10 @@ if(BUILD_TEST)
add_subdirectory(${TORCH_ROOT}/test/cpp/jit ${CMAKE_BINARY_DIR}/test_jit)
add_subdirectory(${TORCH_ROOT}/test/cpp/nativert ${CMAKE_BINARY_DIR}/test_nativert)
add_subdirectory(${TORCH_ROOT}/test/inductor ${CMAKE_BINARY_DIR}/test_inductor)
if(USE_DISTRIBUTED)
add_subdirectory(${TORCH_ROOT}/test/cpp/c10d ${CMAKE_BINARY_DIR}/test_cpp_c10d)
if(NOT WIN32)
add_subdirectory(${TORCH_ROOT}/test/cpp/dist_autograd ${CMAKE_BINARY_DIR}/dist_autograd)
add_subdirectory(${TORCH_ROOT}/test/cpp/rpc ${CMAKE_BINARY_DIR}/test_cpp_rpc)
endif()
add_subdirectory(${TORCH_ROOT}/test/cpp/c10d ${CMAKE_BINARY_DIR}/test_cpp_c10d)
if(NOT WIN32)
add_subdirectory(${TORCH_ROOT}/test/cpp/dist_autograd ${CMAKE_BINARY_DIR}/dist_autograd)
add_subdirectory(${TORCH_ROOT}/test/cpp/rpc ${CMAKE_BINARY_DIR}/test_cpp_rpc)
endif()
if(NOT NO_API)
add_subdirectory(${TORCH_ROOT}/test/cpp/api ${CMAKE_BINARY_DIR}/test_api)
@ -1472,46 +1460,40 @@ if(BUILD_LITE_INTERPRETER)
endif()
endif()

# Pass USE_DISTRIBUTED to torch_cpu, as some codes in jit/pickler.cpp and
# jit/unpickler.cpp need to be compiled only when USE_DISTRIBUTED is set
if(USE_DISTRIBUTED)
target_compile_definitions(torch_cpu PUBLIC USE_DISTRIBUTED)
if(USE_GLOO AND USE_C10D_GLOO)
target_compile_definitions(torch_cpu PUBLIC USE_C10D_GLOO)
endif()
if(USE_UCC AND USE_C10D_UCC)
target_compile_definitions(torch_cpu PUBLIC USE_C10D_UCC)
if(USE_CUDA)
target_compile_definitions(torch_cuda PUBLIC USE_C10D_UCC)
endif()
endif()
if(USE_NCCL AND USE_C10D_NCCL)
if(USE_ROCM)
target_compile_definitions(torch_hip PUBLIC USE_C10D_NCCL)
else()
target_compile_definitions(torch_cuda PUBLIC USE_C10D_NCCL)
endif()
endif()
if(USE_MPI AND USE_C10D_MPI)
if(CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
set_source_files_properties(
"${TORCH_SRC_DIR}/csrc/distributed/c10d/ProcessGroupMPI.cpp"
PROPERTIES COMPILE_FLAGS -Wno-deprecated-declarations)
endif()
target_compile_definitions(torch_cpu PUBLIC USE_C10D_MPI)
endif()
# Pass USE_RPC in order to reduce use of
# #if defined(USE_DISTRIBUTED) && !defined(_WIN32)
# need to be removed when RPC is supported
if(NOT WIN32)
target_compile_definitions(torch_cpu PUBLIC USE_RPC)
endif()
# Pass USE_TENSORPIPE to torch_cpu as some parts of rpc/utils.cpp
# can only be compiled with USE_TENSORPIPE is set.
if(USE_TENSORPIPE)
target_compile_definitions(torch_cpu PUBLIC USE_TENSORPIPE)
endif()

if(NOT INTERN_BUILD_MOBILE)
@ -1134,7 +1134,7 @@ if(USE_CUDA AND CUDA_VERSION VERSION_LESS 13.0)
include_directories(SYSTEM ${CUB_INCLUDE_DIRS})
endif()

if(USE_DISTRIBUTED AND USE_TENSORPIPE)
if(USE_TENSORPIPE)
if(MSVC)
message(WARNING "Tensorpipe cannot be used on Windows.")
else()

@ -193,13 +193,11 @@ function(caffe2_print_configuration_summary)
message(STATUS " USE_PYTORCH_QNNPACK : ${USE_PYTORCH_QNNPACK}")
message(STATUS " USE_XNNPACK : ${USE_XNNPACK}")
message(STATUS " USE_DISTRIBUTED : ${USE_DISTRIBUTED}")
if(${USE_DISTRIBUTED})
message(STATUS " USE_MPI : ${USE_MPI}")
message(STATUS " USE_GLOO : ${USE_GLOO}")
message(STATUS " USE_GLOO_WITH_OPENSSL : ${USE_GLOO_WITH_OPENSSL}")
message(STATUS " USE_GLOO_IBVERBS : ${USE_GLOO_IBVERBS}")
message(STATUS " USE_TENSORPIPE : ${USE_TENSORPIPE}")
endif()
if(NOT "${SELECTED_OP_LIST}" STREQUAL "")
message(STATUS " SELECTED_OP_LIST : ${SELECTED_OP_LIST}")
endif()
@ -3305,13 +3305,6 @@ def coverage_post_process(app, exception):
    if not isinstance(app.builder, CoverageBuilder):
        return

    if not torch.distributed.is_available():
        raise RuntimeError(
            "The coverage tool cannot run with a version "
            "of PyTorch that was built with USE_DISTRIBUTED=0 "
            "as this module's API changes."
        )

    # These are all the modules that have "automodule" in an rst file
    # These modules are the ones for which coverage is checked
    # Here, we make sure that no module is missing from that list

@ -1093,9 +1093,6 @@ The set of leaf modules can be customized by overriding
```{eval-rst}
.. autofunction:: torch.fx.replace_pattern
```
```{eval-rst}
.. autofunction:: torch.fx.traceback.annotate
```

<!-- The experimental and passes submodules are missing docs. -->
<!-- Adding it here for coverage but this doesn't add anything to the -->
@ -156,7 +156,6 @@ def get_generate_code_bin_outs():
|
||||
"autograd/generated/python_torch_functions_1.cpp": ["autograd/generated/python_torch_functions_1.cpp"],
|
||||
"autograd/generated/python_torch_functions_2.cpp": ["autograd/generated/python_torch_functions_2.cpp"],
|
||||
"autograd/generated/python_variable_methods.cpp": ["autograd/generated/python_variable_methods.cpp"],
|
||||
"functionalization/generated/ViewMetaClassesPythonBinding.cpp": ["functionalization/generated/ViewMetaClassesPythonBinding.cpp"],
|
||||
})
|
||||
return outs
|
||||
|
||||
|
||||
13
setup.py
@ -1704,18 +1704,7 @@ def main() -> None:
    package_data = {
        "torch": torch_package_data,
    }
    # some win libraries are excluded
    # these are statically linked
    exclude_windows_libs = [
        "lib/dnnl.lib",
        "lib/kineto.lib",
        "lib/libprotobuf-lite.lib",
        "lib/libprotobuf.lib",
        "lib/libprotoc.lib",
    ]
    exclude_package_data = {
        "torch": exclude_windows_libs,
    }
    exclude_package_data = {}

    if not BUILD_LIBTORCH_WHL:
        package_data["torchgen"] = torchgen_package_data
@ -1,4 +1,4 @@
if(USE_DISTRIBUTED AND NOT WIN32)
if(NOT WIN32)
set(DIST_AUTOGRAD_TEST_DIR "${TORCH_ROOT}/test/cpp/dist_autograd")
set(DIST_AUTOGRAD_TEST_SOURCES
${TORCH_ROOT}/test/cpp/common/main.cpp

@ -1,7 +1,9 @@
if(WIN32)
set(TORCH_PYTHON_IMPORTED_LOCATION "${PYTORCH_INSTALL_DIR}/lib/${CMAKE_IMPORT_LIBRARY_PREFIX}torch_python${CMAKE_IMPORT_LIBRARY_SUFFIX}")
set(TORCH_PYTHON_IMPORTED_LOCATION "${PYTORCH_INSTALL_DIR}/lib/torch_python.lib")
elseif(APPLE)
set(TORCH_PYTHON_IMPORTED_LOCATION "${PYTORCH_INSTALL_DIR}/lib/libtorch_python.dylib")
else()
set(TORCH_PYTHON_IMPORTED_LOCATION "${PYTORCH_INSTALL_DIR}/lib/${CMAKE_SHARED_LIBRARY_PREFIX}torch_python${CMAKE_SHARED_LIBRARY_SUFFIX}")
set(TORCH_PYTHON_IMPORTED_LOCATION "${PYTORCH_INSTALL_DIR}/lib/libtorch_python.so")
endif()

add_library(torch_python SHARED IMPORTED)
@ -143,19 +143,6 @@ class FlightRecorderEventTest(TestCase):
|
||||
match_one_event(e11, e12, membership, "0").state,
|
||||
MatchState.FULLY_MATCHED,
|
||||
)
|
||||
e13 = create_one_event(
|
||||
"gather",
|
||||
("0", "default"),
|
||||
[[4, 4]],
|
||||
[[4, 4]],
|
||||
"completed",
|
||||
1,
|
||||
output_dtypes="",
|
||||
)
|
||||
self.assertEqual(
|
||||
match_one_event(e11, e13, membership, "0").state,
|
||||
MatchState.FULLY_MATCHED,
|
||||
)
|
||||
|
||||
def test_all_events(self):
|
||||
for collective in sorted(COLLECTIVES):
|
||||
|
||||
@ -202,62 +202,6 @@ class ScheduleTest(TestCase):
|
||||
|
||||
torch.distributed.destroy_process_group()
|
||||
|
||||
@parametrize(
|
||||
"ScheduleClass",
|
||||
[
|
||||
Schedule1F1B,
|
||||
ScheduleGPipe,
|
||||
ScheduleInterleaved1F1B,
|
||||
ScheduleInterleavedZeroBubble,
|
||||
ScheduleLoopedBFS,
|
||||
],
|
||||
)
|
||||
def test_schedule_eval_then_train(self, ScheduleClass):
|
||||
"""
|
||||
Test that simply runs evaluation followed by training.
|
||||
"""
|
||||
store = FakeStore()
|
||||
torch.distributed.init_process_group(
|
||||
backend="fake", rank=0, world_size=1, store=store
|
||||
)
|
||||
d_hid, batch_size = 512, 256
|
||||
n_stages = 1
|
||||
device = "cpu"
|
||||
full_mod = MultiMLP(d_hid, n_layers=n_stages)
|
||||
full_mod.to(device)
|
||||
|
||||
x = torch.randn(batch_size, d_hid, device=device)
|
||||
target = torch.randn(batch_size, d_hid, device=device)
|
||||
|
||||
def loss_fn(y, target):
|
||||
return torch.nn.functional.cross_entropy(y, target)
|
||||
|
||||
submod_name = "layers.0"
|
||||
stage_module = full_mod.get_submodule(submod_name)
|
||||
|
||||
# Create a pipeline stage to wrap that submodule
|
||||
num_microbatches = 2
|
||||
stages = [PipelineStage(stage_module, 0, n_stages, device)]
|
||||
|
||||
if issubclass(ScheduleClass, PipelineScheduleSingle):
|
||||
stages = stages[0]
|
||||
|
||||
# Attach to a schedule
|
||||
schedule = ScheduleClass(stages, num_microbatches, loss_fn=loss_fn)
|
||||
# Run eval
|
||||
for _ in range(2):
|
||||
# Zero gradients
|
||||
stage_module.zero_grad()
|
||||
losses = []
|
||||
schedule.eval(x, target=target, losses=losses)
|
||||
# Run training
|
||||
try:
|
||||
for _ in range(2):
|
||||
losses = []
|
||||
schedule.step(x, target=target, losses=losses)
|
||||
finally:
|
||||
torch.distributed.destroy_process_group()
|
||||
|
||||
def test_zero_bubble_schedule_errors_with_compile(self):
|
||||
"""
|
||||
Test that zero bubble schedules raise an error when used with torch.compile.
|
||||
|
||||
@ -352,7 +352,7 @@ class MicroPipelineTPTest(TestCase):
|
||||
@parametrize("scatter_dim", [0, 1, 2])
|
||||
@fresh_cache()
|
||||
def test_fuse_scaled_matmul_reduce_scatter(self, A_dims, scatter_dim):
|
||||
if scatter_dim >= A_dims - 1:
|
||||
if scatter_dim >= A_dims:
|
||||
return
|
||||
|
||||
group = dist.group.WORLD
|
||||
@ -402,7 +402,7 @@ class MicroPipelineTPTest(TestCase):
|
||||
|
||||
@runOnRocmArch(MI300_ARCH)
|
||||
@unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
|
||||
@parametrize("scatter_dim", [0, 1])
|
||||
@parametrize("scatter_dim", [0, 1, 2])
|
||||
@fresh_cache()
|
||||
def test_fuse_scaled_matmul_reduce_scatter_rowwise_scales_reshape_mm_reshape(
|
||||
self, scatter_dim
|
||||
|
||||
41
test/distributed/tensor/test_fake.py
Normal file
@ -0,0 +1,41 @@
# Copyright (c) Meta Platforms, Inc. and affiliates
# Owner(s): ["oncall: distributed"]

import torch
from torch._subclasses.fake_tensor import FakeTensorMode
from torch.distributed.tensor import DTensor
from torch.distributed.tensor.placement_types import Shard
from torch.testing._internal.common_utils import run_tests, TestCase
from torch.testing._internal.distributed.fake_pg import FakeStore


class TestFakeDTensor(TestCase):
    def test_fake_dtensor_operations(self):
        # Use FakeTensorMode to handle CUDA tensors without actual CUDA
        fake_mode = FakeTensorMode()
        world_size = 4

        fake_store = FakeStore()
        torch.distributed.init_process_group(
            "fake", store=fake_store, rank=0, world_size=world_size
        )
        device_mesh = torch.distributed.device_mesh.init_device_mesh(
            "cuda",
            (2, world_size // 2),
        )

        # Create fake CUDA tensor using FakeTensorMode
        with fake_mode:
            x = torch.randn(1, 1, device="cuda")
            x = DTensor.from_local(x, device_mesh, [Shard(0), Shard(1)])

        # Test basic DTensor operations
        self.assertIsInstance(x, DTensor)

        # Test sum operation
        r = x.sum(1)
        self.assertIsInstance(r, DTensor)


if __name__ == "__main__":
    run_tests()
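A rough usage sketch of the pattern the new test relies on (assumptions: single process, CPU mesh, no FakeTensorMode; the "fake" backend and FakeStore are the same internal test helpers the file imports):

```python
import torch
from torch.testing._internal.distributed.fake_pg import FakeStore

# The "fake" process-group backend lets one process exercise
# torch.distributed APIs without spawning workers or needing GPUs.
store = FakeStore()
torch.distributed.init_process_group("fake", store=store, rank=0, world_size=4)
mesh = torch.distributed.device_mesh.init_device_mesh("cpu", (2, 2))
print(mesh)
torch.distributed.destroy_process_group()
```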
@ -880,34 +880,6 @@ class DistMathOpsTest(DTensorTestBase):
        out_full = out_dt.full_tensor()
        self.assertEqual(global_bins, out_full)

    @with_comms
    def test_logsumexp(self):
        mesh = self.build_device_mesh()
        comm_mode = CommDebugMode()
        inp = torch.rand(3, 5, device=self.device_type)

        shard_dim = 0
        input_dtensor = distribute_tensor(
            inp, device_mesh=mesh, placements=[Shard(shard_dim)]
        )

        logsumexp_dims = [0, 1]
        for dim in logsumexp_dims:
            output = torch.logsumexp(inp, dim=dim)
            with comm_mode:
                output_dtensor = torch.logsumexp(input_dtensor, dim=dim)
            if dim == shard_dim:
                self.assertEqual(comm_mode.get_total_counts(), 1)
                self.assertEqual(
                    comm_mode.get_comm_counts()[funcol.all_gather_into_tensor],
                    1,
                )
                self.assertTrue(output_dtensor.placements[0].is_replicate())
            else:
                self.assertEqual(comm_mode.get_total_counts(), 0)
                self.assertTrue(output_dtensor.placements[0].is_shard(shard_dim))
            self.assertEqual(output_dtensor.full_tensor(), output)


if __name__ == "__main__":
    run_tests()
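As a side note (plain tensors only, not part of the change): reducing over the sharded dimension cannot stay local because each shard sees only part of the reduction; the per-shard results must still be combined, for example with another logsumexp.

```python
import torch

a, b = torch.rand(2, 5), torch.rand(1, 5)      # two "shards" along dim 0
full = torch.cat([a, b], dim=0)
partial = torch.stack([torch.logsumexp(a, 0), torch.logsumexp(b, 0)])
# logsumexp over the full dim equals logsumexp of the per-shard partial results
torch.testing.assert_close(torch.logsumexp(full, 0), torch.logsumexp(partial, 0))
```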
@ -505,7 +505,7 @@ class AsyncTPTest(MultiProcContinuousTest):
|
||||
not PLATFORM_SUPPORTS_SYMM_MEM, "SymmMem is not supported on this ROCm arch"
|
||||
)
|
||||
@skip_if_lt_x_gpu(2)
|
||||
@parametrize("scatter_dim", [0, 1, 2])
|
||||
@parametrize("scatter_dim", [0, 1])
|
||||
def test_fused_matmul_reduce_scatter(self, scatter_dim: int) -> None:
|
||||
self._init_process()
|
||||
|
||||
|
||||
@ -519,7 +519,11 @@ class AOTAutogradCacheTests(InductorTestCase):
|
||||
@functorch_config.patch(
|
||||
{"enable_autograd_cache": True, "view_replay_for_aliased_outputs": True}
|
||||
)
|
||||
def test_view_replay(self):
|
||||
def test_view_replay_bypass(self):
|
||||
"""
|
||||
Should bypass when view replay is turned on
|
||||
"""
|
||||
|
||||
def fn(a):
|
||||
tmp = a.detach()
|
||||
a.mul_(2)
|
||||
@ -527,25 +531,10 @@ class AOTAutogradCacheTests(InductorTestCase):
|
||||
|
||||
with torch.autograd._force_original_view_tracking(True):
|
||||
compiled_fn = torch.compile(fn)
|
||||
compiled_fn(torch.rand(2, 3))
|
||||
|
||||
def run_and_check(miss, hit, bypass):
|
||||
self._clear_dynamo_and_codecache()
|
||||
|
||||
inp = torch.rand(2, 3)
|
||||
compiled_inp = inp.clone().detach()
|
||||
|
||||
with torch.autograd._force_original_view_tracking(True):
|
||||
out = fn(inp)
|
||||
compiled_out = compiled_fn(compiled_inp)
|
||||
|
||||
self.assertEqual(out, compiled_out)
|
||||
self.assertEqual(counters["aot_autograd"]["autograd_cache_miss"], miss)
|
||||
self.assertEqual(counters["aot_autograd"]["autograd_cache_hit"], hit)
|
||||
self.assertEqual(counters["aot_autograd"]["autograd_cache_bypass"], bypass)
|
||||
|
||||
run_and_check(miss=1, hit=0, bypass=0)
|
||||
run_and_check(miss=1, hit=1, bypass=0)
|
||||
run_and_check(miss=1, hit=2, bypass=0)
|
||||
self.assertEqual(counters["aot_autograd"]["autograd_cache_miss"], 1)
|
||||
self.assertEqual(counters["aot_autograd"]["autograd_cache_bypass"], 1)
|
||||
|
||||
@inductor_config.patch("fx_graph_remote_cache", False)
|
||||
@inductor_config.patch("fx_graph_cache", True)
|
||||
|
||||
@ -21,7 +21,6 @@ from unittest.mock import MagicMock, patch
|
||||
|
||||
import torch
|
||||
import torch._dynamo as torchdynamo
|
||||
import torch.fx.traceback as fx_traceback
|
||||
import torch.nn.functional as F
|
||||
import torch.utils._pytree as pytree
|
||||
from functorch.experimental.control_flow import cond, map
|
||||
@ -62,10 +61,7 @@ from torch.export.passes import move_to_device_pass
|
||||
from torch.fx.experimental.proxy_tensor import make_fx
|
||||
from torch.fx.experimental.symbolic_shapes import ShapeEnv
|
||||
from torch.testing import FileCheck
|
||||
from torch.testing._internal.common_cuda import (
|
||||
PLATFORM_SUPPORTS_FLASH_ATTENTION,
|
||||
xfailIfDistributedNotSupported,
|
||||
)
|
||||
from torch.testing._internal.common_cuda import PLATFORM_SUPPORTS_FLASH_ATTENTION
|
||||
from torch.testing._internal.common_utils import (
|
||||
find_library_location,
|
||||
IS_FBCODE,
|
||||
@ -15072,39 +15068,6 @@ def forward(self, x):
|
||||
test_serdes=True,
|
||||
)
|
||||
|
||||
# TODO: following tests should be fixed
|
||||
@testing.expectedFailureTrainingIRToRunDecomp
|
||||
@testing.expectedFailureTrainingIRToRunDecompNonStrict
|
||||
def test_preserve_annotation(self):
|
||||
class M(torch.nn.Module):
|
||||
def forward(self, x):
|
||||
with fx_traceback.annotate({"pp_stage": 0}):
|
||||
with fx_traceback.annotate({"fdsp_bucket": 0}):
|
||||
x = x + 1
|
||||
x = x - 2
|
||||
with fx_traceback.annotate({"cuda_stream": 2, "fsdp_bucket": 1}):
|
||||
x = x * 2
|
||||
x = x / 3
|
||||
return x
|
||||
|
||||
m = M()
|
||||
|
||||
with fx_traceback.preserve_node_meta():
|
||||
ep = export(m, (torch.randn(10),))
|
||||
|
||||
for node in ep.graph.nodes:
|
||||
if node.target == torch.ops.aten.add.default:
|
||||
self.assertTrue(node.meta["custom"], {"pp_stage": 0, "fdsp_bucket": 0})
|
||||
if node.target == torch.ops.aten.sub.default:
|
||||
self.assertTrue(node.meta["custom"], {"pp_stage": 0})
|
||||
if node.target == torch.ops.aten.mul.default:
|
||||
self.assertTrue(
|
||||
node.meta["custom"],
|
||||
{"pp_stage": 0, "cuda_stream": 2, "fsdp_bucket": 1},
|
||||
)
|
||||
if node.target == torch.ops.aten.div.default:
|
||||
self.assertTrue(node.meta["custom"], {})
|
||||
|
||||
def test_dynamic_shapes_serdes_generic(self):
|
||||
from torch._export.serde.dynamic_shapes import (
|
||||
_dump_dynamic_shapes,
|
||||
@ -15824,7 +15787,6 @@ class GraphModule(torch.nn.Module):
|
||||
finally:
|
||||
torch.distributed.destroy_process_group()
|
||||
|
||||
@xfailIfDistributedNotSupported
|
||||
def test_distributed_all_reduce(self):
|
||||
class Foo(torch.nn.Module):
|
||||
def __init__(self):
|
||||
@ -15842,7 +15804,6 @@ class GraphModule(torch.nn.Module):
|
||||
inp = (torch.randn(4, 4),)
|
||||
self.assertTrue(torch.allclose(ep.module()(*inp), m(*inp)))
|
||||
|
||||
@xfailIfDistributedNotSupported
|
||||
def test_distributed_all_gather(self):
|
||||
class Foo(torch.nn.Module):
|
||||
def forward(self, x):
|
||||
@ -15858,7 +15819,6 @@ class GraphModule(torch.nn.Module):
|
||||
torch.allclose(a, b) for a, b in zip(ep.module()(*inp), m(*inp))
|
||||
)
|
||||
|
||||
@xfailIfDistributedNotSupported
|
||||
def test_distributed_all_gather_into_tensor(self):
|
||||
class Foo(torch.nn.Module):
|
||||
def forward(self, x):
|
||||
@ -15872,7 +15832,6 @@ class GraphModule(torch.nn.Module):
|
||||
inp = (torch.randn(2),)
|
||||
self.assertTrue(torch.allclose(ep.module()(*inp), m(*inp)))
|
||||
|
||||
@xfailIfDistributedNotSupported
|
||||
@testing.expectedFailureCppRuntime
|
||||
def test_distributed_all_to_all_single(self):
|
||||
class Foo(torch.nn.Module):
|
||||
@ -15890,7 +15849,6 @@ class GraphModule(torch.nn.Module):
|
||||
)
|
||||
self.assertEqual(len(nodes), 1)
|
||||
|
||||
@xfailIfDistributedNotSupported
|
||||
@testing.expectedFailureCppRuntime
|
||||
def test_distributed_reduce_scatter_tensor(self):
|
||||
class Foo(torch.nn.Module):
|
||||
|
||||
@ -8500,6 +8500,7 @@ class TestAOTAutogradWithCache(TestAOTAutogradWithDynamo):
|
||||
{
|
||||
"enable_autograd_cache": True,
|
||||
"strict_autograd_cache": True,
|
||||
"view_replay_for_aliased_outputs": False,
|
||||
}
|
||||
)
|
||||
@torch._inductor.config.patch("fx_graph_cache", True)
|
||||
|
||||
@ -20,7 +20,11 @@ from torch._inductor import config
|
||||
from torch._inductor.codegen.cpp import CppScheduling
|
||||
from torch._inductor.codegen.triton import TritonScheduling
|
||||
from torch._inductor.codegen.wrapper import PythonWrapperCodegen
|
||||
from torch._inductor.codegen.wrapper_fxir import FxConverter, WrapperFxCodegen
|
||||
from torch._inductor.codegen.wrapper_fxir import (
|
||||
FxConverter,
|
||||
replace_floor_div,
|
||||
WrapperFxCodegen,
|
||||
)
|
||||
from torch._inductor.test_case import TestCase as InductorTestCase
|
||||
from torch.export import Dim
|
||||
from torch.testing._internal.common_utils import (
|
||||
@ -34,6 +38,7 @@ from torch.testing._internal.inductor_utils import (
|
||||
requires_gpu,
|
||||
TRITON_HAS_CPU,
|
||||
)
|
||||
from torch.utils._sympy.functions import FloorDiv
|
||||
|
||||
|
||||
if HAS_GPU:
|
||||
@ -483,10 +488,11 @@ class FxirTestCase(InductorTestCase):
|
||||
)
|
||||
self.assertIn("ks0", triton_node.kwargs["kwargs"])
|
||||
|
||||
def test_dynamic_launch_grid_calc_python(self):
|
||||
def test_dynamic_launch_grid_calc(self):
|
||||
"""
|
||||
Test the dyanmic launch grid calculation for Triton kernel wrapper using python mode
|
||||
Test the dyanmic launch grid calculation.
|
||||
"""
|
||||
|
||||
func = torch.add
|
||||
args = [torch.randn(shape, device=self.device) for shape in [(7, 12), (7, 1)]]
|
||||
(gm,) = self._compile_and_check(func, args, compile_kwargs={"dynamic": True})
|
||||
@ -505,41 +511,6 @@ class FxirTestCase(InductorTestCase):
|
||||
self.assertEqual(grid[1], 1)
|
||||
self.assertEqual(grid[2], 1)
|
||||
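The launch-grid expression these tests check is plain ceiling division of the element count by the block size; a minimal standalone check (the values here are made up):

```python
xnumel, XBLOCK = 84, 16
grid0 = (xnumel + XBLOCK - 1) // XBLOCK   # ceil(84 / 16)
assert grid0 == -(-xnumel // XBLOCK) == 6
```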
|
||||
def test_dynamic_launch_grid_calc_python_slow(self):
|
||||
"""
|
||||
Test the dyanmic launch grid calculation for Triton kernel wrapper using python_slow mode
|
||||
"""
|
||||
from torch._inductor.runtime.triton_heuristics import GridExpr
|
||||
|
||||
# Mock GridExpr.from_meta to use "python_slow" mode explicitly
|
||||
original_from_meta = GridExpr.from_meta
|
||||
|
||||
def mocked_from_meta(inductor_meta, cfg, mode="python"):
|
||||
return original_from_meta(inductor_meta, cfg, mode="python_slow")
|
||||
|
||||
with unittest.mock.patch.object(GridExpr, "from_meta", mocked_from_meta):
|
||||
func = torch.add
|
||||
args = [
|
||||
torch.randn(shape, device=self.device) for shape in [(7, 12), (7, 1)]
|
||||
]
|
||||
(gm,) = self._compile_and_check(
|
||||
func, args, compile_kwargs={"dynamic": True}
|
||||
)
|
||||
|
||||
# Check for the precomputed size arg.
|
||||
(triton_node,) = gm.graph.find_nodes(
|
||||
op="call_function", target=triton_kernel_wrapper_mutation
|
||||
)
|
||||
self.assertIn("grid", triton_node.kwargs)
|
||||
self.assertIn("xnumel", triton_node.kwargs["kwargs"])
|
||||
self.assertIn("XBLOCK", triton_node.kwargs["kwargs"])
|
||||
grid = triton_node.kwargs["grid"][0]
|
||||
xnumel = triton_node.kwargs["kwargs"]["xnumel"].meta["val"]
|
||||
xblock = triton_node.kwargs["kwargs"]["XBLOCK"]
|
||||
self.assertEqual(grid[0].meta["val"], ((xnumel + xblock - 1) // xblock))
|
||||
self.assertEqual(grid[1], 1)
|
||||
self.assertEqual(grid[2], 1)
|
||||
|
||||
@config.patch({"trace.enabled": True})
|
||||
@unittest.mock.patch("torch._inductor.debug.DebugFormatter.output_code")
|
||||
def test_debug(self, mock_output_code):
|
||||
@ -990,6 +961,29 @@ def forward(self, arg0_1, arg1_1, arg2_1):
|
||||
return [buf1, buf2]""", # noqa: B950
|
||||
)
|
||||
|
||||
def test_dims_dynamic_outer_static_padded_inner(self):
|
||||
"""
|
||||
Test padding on inner dimensions, with dynamic outer dimensions.
|
||||
"""
|
||||
|
||||
class M(torch.nn.Module):
|
||||
def forward(self, x, y):
|
||||
return x + y
|
||||
|
||||
def get_input_padded_inner(shape):
|
||||
full_shape = shape[:-1] + (shape[-1] * 2,)
|
||||
full = torch.randn(full_shape, dtype=torch.float32, device=self.device)
|
||||
view = torch.as_strided(full, shape, full.stride())
|
||||
return view
|
||||
|
||||
shape = (4, 4, 4)
|
||||
args = tuple(get_input_padded_inner(shape) for _ in range(2))
|
||||
self.check(
|
||||
M(),
|
||||
args,
|
||||
dynamic_shapes=({0: Dim.DYNAMIC, 1: Dim.DYNAMIC, 2: Dim.STATIC},) * 2,
|
||||
)
|
||||
|
||||
@parametrize("length", (4, 8))
|
||||
def test_cond_dynamic_shape_pred_scalar_closure(self, length: int):
|
||||
"""
|
||||
@ -1033,6 +1027,132 @@ def forward(self, arg0_1, arg1_1, arg2_1):
|
||||
self.check(M(), (x,), dynamic_shapes=({0: Dim.DYNAMIC},))
|
||||
|
||||
|
||||
class TestReplaceFloorDiv(InductorTestCase):
|
||||
"""
|
||||
Tests for floor -> FloorDiv conversion.
|
||||
"""
|
||||
|
||||
def _check(self, expr: sympy.Expr) -> sympy.Expr:
|
||||
# Check that we started with floor's.
|
||||
num_floors = expr.count(sympy.floor)
|
||||
self.assertGreater(num_floors, 0)
|
||||
|
||||
replaced = replace_floor_div(expr)
|
||||
|
||||
# Check that all floor's were replaced.
|
||||
# We shoud have no more new FloorDiv's than floor's in the original expression,
|
||||
# although we can have less due to simplification.
|
||||
self.assertEqual(replaced.count(sympy.floor), 0)
|
||||
self.assertLessEqual(
|
||||
replaced.count(FloorDiv) - expr.count(FloorDiv), num_floors
|
||||
)
|
||||
|
||||
def expand_floor_div(
|
||||
numerator: sympy.Expr, denominator: sympy.Expr
|
||||
) -> sympy.Expr:
|
||||
return sympy.floor(numerator / denominator)
|
||||
|
||||
# Expand FloorDiv back into floor and check for equality.
|
||||
self.assertEqual(
|
||||
*[
|
||||
sympy.simplify(e.replace(FloorDiv, expand_floor_div))
|
||||
for e in (replaced, expr)
|
||||
]
|
||||
)
|
||||
|
||||
return replaced
|
||||
|
||||
def test_rewrite_floor_div_mul_pow(self):
|
||||
x, y = sympy.symbols("x y")
|
||||
expr = sympy.floor(x / y)
|
||||
self.assertEqual(expr.count(FloorDiv), 0)
|
||||
self.assertEqual(expr.count(sympy.core.mul.Mul), 1)
|
||||
self.assertEqual(expr.count(sympy.Pow), 1)
|
||||
|
||||
rewritten = self._check(expr)
|
||||
self.assertTrue(isinstance(rewritten, FloorDiv))
|
||||
self.assertEqual(rewritten.args, (x, y))
|
||||
|
||||
def test_rewrite_floor_div_mul_rational(self):
|
||||
x = sympy.Symbol("x")
|
||||
expr = sympy.floor(x / 5)
|
||||
self.assertEqual(expr.count(FloorDiv), 0)
|
||||
self.assertEqual(expr.count(sympy.core.mul.Mul), 1)
|
||||
self.assertEqual(expr.count(sympy.Rational), 1)
|
||||
|
||||
rewritten = self._check(expr)
|
||||
self.assertTrue(isinstance(rewritten, FloorDiv))
|
||||
self.assertEqual(rewritten.args, (x, 5))
|
||||
|
||||
def test_no_rewrite_div(self):
|
||||
x, y = sympy.symbols("x y")
|
||||
expr = x / y
|
||||
self.assertEqual(expr.count(FloorDiv), 0)
|
||||
|
||||
rewritten = replace_floor_div(expr)
|
||||
self.assertEqual(rewritten, expr)
|
||||
|
||||
def test_rewrite_floor_div_nested(self):
|
||||
x, y = sympy.symbols("x y")
|
||||
expr = sympy.floor((sympy.floor(x / 5) + 1) / y)
|
||||
self.assertEqual(expr.count(FloorDiv), 0)
|
||||
|
||||
rewritten = self._check(expr)
|
||||
self.assertEqual(rewritten.count(FloorDiv), 2)
|
||||
|
||||
def test_rewrite_floor_div_rational_const(self):
|
||||
expr = sympy.floor(sympy.S.One / 5, evaluate=False)
|
||||
self.assertEqual(expr.count(FloorDiv), 0)
|
||||
self.assertEqual(expr.count(sympy.Mul), 0)
|
||||
self.assertEqual(expr.count(sympy.Rational), 1)
|
||||
|
||||
# Expression evaluates to a compile time constant
|
||||
rewritten = self._check(expr)
|
||||
self.assertEqual(rewritten, sympy.S.Zero)
|
||||
|
||||
def test_no_distribute_mul_floordiv(self):
|
||||
"""
|
||||
Test that multiplication doesn't distribute with floor division.
|
||||
"""
|
||||
x = sympy.Symbol("x")
|
||||
expr = 2 * sympy.floor(x / 2)
|
||||
rewritten = self._check(expr)
|
||||
self.assertEqual(rewritten.count(sympy.Mul), 1)
|
||||
self.assertEqual(rewritten.count(FloorDiv), 1)
|
||||
|
||||
def test_rational_multi_pows(self):
|
||||
"""
|
||||
Test an expression with a rational and multiple pows.
|
||||
"""
|
||||
x, y, z = sympy.symbols("x y z")
|
||||
expr = sympy.floor((x / 5) * (y**2) * (z**3))
|
||||
mul = expr.args[0]
|
||||
self.assertTrue(isinstance(mul, sympy.Mul))
|
||||
self.assertTrue(isinstance(mul.args[0], sympy.Rational))
|
||||
self.assertEqual(expr.count(sympy.Pow), 2)
|
||||
rewritten = self._check(expr)
|
||||
self.assertEqual(rewritten.count(FloorDiv), 1)
|
||||
|
||||
def test_variable_exp(self):
|
||||
"""
|
||||
Test pow when the exponent is a variable.
|
||||
"""
|
||||
x = sympy.Symbol("x", positive=True)
|
||||
expr = sympy.floor(2**-x)
|
||||
replaced = self._check(expr)
|
||||
|
||||
# Check that x went to the denominator.
|
||||
self.assertEqual(replaced.args, (1, 2**x))
|
||||
|
||||
def test_launch_grid_dynamic_padding(self):
|
||||
"""
|
||||
Test a complex launch grid expression arising from padding with dynamic shapes.
|
||||
"""
|
||||
x, y = sympy.symbols("x y")
|
||||
expr = sympy.floor(-FloorDiv(x * y, 2) / FloorDiv(-x * y, 131070))
|
||||
self._check(expr)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
from torch._inductor.test_case import run_tests
|
||||
|
||||
|
||||
@ -3238,40 +3238,6 @@ aten::mm""",
|
||||
assert "Overload Name" in key_averages.table()
|
||||
validate_json(prof)
|
||||
|
||||
def test_expose_kineto_event_metadata(self):
|
||||
def check_metadata(prof, op_name, metadata_key):
|
||||
with TemporaryFileName(mode="w+") as fname:
|
||||
prof.export_chrome_trace(fname)
|
||||
with open(fname) as f:
|
||||
events = json.load(f)["traceEvents"]
|
||||
found_op = False
|
||||
for e in events:
|
||||
if "name" in e and "args" in e and e["name"] == op_name:
|
||||
assert metadata_key in e["args"], (
|
||||
f"Metadata for '{op_name}' in Chrome trace did not contain '{metadata_key}'."
|
||||
)
|
||||
found_op = True
|
||||
assert found_op, f"Could not find op '{op_name}' in Chrome trace."
|
||||
found_op = False
|
||||
for event in prof.events():
|
||||
if event.name == op_name:
|
||||
assert metadata_key in event.metadata_json, (
|
||||
f"Metadata for '{op_name}' in FunctionEvent did not contain '{metadata_key}'."
|
||||
)
|
||||
found_op = True
|
||||
assert found_op, f"Could not find op '{op_name}' in prof.events()."
|
||||
|
||||
experimental_config = torch._C._profiler._ExperimentalConfig(
|
||||
expose_kineto_event_metadata=True
|
||||
)
|
||||
with profile(
|
||||
experimental_config=experimental_config,
|
||||
activities=[ProfilerActivity.CPU],
|
||||
) as prof:
|
||||
torch.add(1, 5)
|
||||
|
||||
check_metadata(prof, op_name="aten::add", metadata_key="Ev Idx")
|
||||
|
||||
@unittest.skipIf(not torch.cuda.is_available(), "requries CUDA")
|
||||
def test_profiler_debug_autotuner(self):
|
||||
"""
|
||||
|
||||
@ -7,7 +7,7 @@ import sys
|
||||
from dataclasses import dataclass
|
||||
from multiprocessing.context import SpawnProcess
|
||||
from typing import Any, Optional
|
||||
from unittest import skipUnless
|
||||
from unittest import skipIf, skipUnless
|
||||
from unittest.mock import mock_open, patch
|
||||
|
||||
import torch
|
||||
@ -22,7 +22,7 @@ from torch.numa.binding import (
|
||||
AffinityMode,
|
||||
NumaOptions,
|
||||
)
|
||||
from torch.testing._internal.common_utils import run_tests, TestCase
|
||||
from torch.testing._internal.common_utils import IS_MACOS, run_tests, TestCase
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
@ -680,6 +680,7 @@ class NumaBindingTest(TestCase):
|
||||
set(range(0, 2)),
|
||||
)
|
||||
|
||||
@skipIf(IS_MACOS, "sched_getaffinity doesn't exist")
|
||||
def test_binds_to_node_0_if_node_stored_as_minus_one(self) -> None:
|
||||
self._add_mock_hardware(
|
||||
num_sockets=1,
|
||||
|
||||
@ -2488,9 +2488,9 @@ class TestSparseCSR(TestCase):
|
||||
self.assertEqual(a.grad, a1.grad)
|
||||
self.assertEqual(b.grad, b1.grad)
|
||||
|
||||
@skipCUDAIfRocm
|
||||
@onlyCUDA
|
||||
@skipCUDAIf(True, "Causes CUDA memory exception, see https://github.com/pytorch/pytorch/issues/72177")
|
||||
# It works on ROCm and CUDA issue is currently active
|
||||
@skipCUDAIf(not TEST_WITH_ROCM, "Causes CUDA memory exception, see https://github.com/pytorch/pytorch/issues/72177")
|
||||
@dtypes(torch.float32, torch.float64, torch.complex64, torch.complex128)
|
||||
@precisionOverride({torch.float32: 1e-3, torch.complex64: 1e-3,
|
||||
torch.float64: 1e-8, torch.complex128: 1e-8})
|
||||
|
||||
@ -88,8 +88,7 @@ def build_pytorch(
|
||||
) -> None:
|
||||
my_env = _create_build_env()
|
||||
if (
|
||||
not check_negative_env_flag("USE_DISTRIBUTED")
|
||||
and not check_negative_env_flag("USE_CUDA")
|
||||
not check_negative_env_flag("USE_CUDA")
|
||||
and not check_negative_env_flag("USE_NCCL")
|
||||
and not check_env_flag("USE_SYSTEM_NCCL")
|
||||
):
|
||||
|
||||
@ -469,30 +469,6 @@ class Op:
|
||||
f"{p2p_info}, " if p2p_info else ""
|
||||
)
|
||||
|
||||
def dtype_mismatch(self, other: "Op") -> bool:
|
||||
if (
|
||||
(
|
||||
self.type not in ["scatter", "gather", "broadcast"]
|
||||
and set(self.input_dtypes) != set(self.output_dtypes)
|
||||
and self.input_sizes[0]
|
||||
and self.output_sizes[0]
|
||||
)
|
||||
or (
|
||||
self.type not in ["scatter", "broadcast"]
|
||||
and set(self.input_dtypes) != set(other.input_dtypes)
|
||||
and self.input_sizes[0]
|
||||
and other.input_sizes[0]
|
||||
)
|
||||
or (
|
||||
self.type not in ["gather"]
|
||||
and set(self.output_dtypes) != set(other.output_dtypes)
|
||||
and self.output_sizes[0]
|
||||
and other.output_sizes[0]
|
||||
)
|
||||
):
|
||||
return True
|
||||
return False
|
||||
|
||||
def match(self, other: "Op") -> MatchInfo:
|
||||
# TODO: I think this can validly not match,
|
||||
# e.g. if one PG was used for p2p ops between only some of the peers?
|
||||
@ -534,7 +510,23 @@ class Op:
|
||||
MatchState.COLLECTIVE_STATE_MISMATCH,
|
||||
f"Expected state: '{self.state}' does not match found state: '{other.state}'",
|
||||
)
|
||||
if self.dtype_mismatch(other):
|
||||
if (
|
||||
(
|
||||
set(self.input_dtypes) != set(self.output_dtypes)
|
||||
and self.input_sizes[0]
|
||||
and self.output_sizes[0]
|
||||
)
|
||||
or (
|
||||
set(self.input_dtypes) != set(other.input_dtypes)
|
||||
and self.input_sizes[0]
|
||||
and other.input_sizes[0]
|
||||
)
|
||||
or (
|
||||
set(self.input_dtypes) != set(other.output_dtypes)
|
||||
and self.input_sizes[0]
|
||||
and other.output_sizes[0]
|
||||
)
|
||||
):
|
||||
return MatchInfo(
|
||||
MatchState.COLLECTIVE_DTYPE_MISMATCH,
|
||||
f"Expected dtypes: '{set(self.input_dtypes)}' does not "
|
||||
|
||||
@ -189,12 +189,6 @@ def main() -> None:
|
||||
)
|
||||
options = parser.parse_args()
|
||||
|
||||
# Path: aten/src/ATen
|
||||
aten_path = os.path.dirname(os.path.dirname(options.native_functions_path))
|
||||
operator_selector = get_selector(
|
||||
options.selected_op_list_path, options.operators_yaml_path
|
||||
)
|
||||
|
||||
generate_code(
|
||||
options.gen_dir,
|
||||
options.native_functions_path,
|
||||
@ -204,37 +198,18 @@ def main() -> None:
|
||||
options.disable_autograd,
|
||||
options.force_schema_registration,
|
||||
# options.selected_op_list
|
||||
operator_selector=operator_selector,
|
||||
)
|
||||
|
||||
# Generate the python bindings for functionalization's `ViewMeta` classes.
|
||||
from torchgen.gen_functionalization_type import (
|
||||
gen_functionalization_view_meta_classes,
|
||||
)
|
||||
|
||||
functionalization_templates_dir = os.path.join(aten_path, "templates")
|
||||
install_dir = options.install_dir or os.fspath(options.gen_dir / "torch/csrc")
|
||||
functionalization_install_dir = os.path.join(
|
||||
install_dir, "functionalization", "generated"
|
||||
)
|
||||
|
||||
os.makedirs(functionalization_install_dir, exist_ok=True)
|
||||
assert os.path.isdir(functionalization_install_dir)
|
||||
assert os.path.isdir(functionalization_templates_dir)
|
||||
|
||||
gen_functionalization_view_meta_classes(
|
||||
options.native_functions_path or NATIVE_FUNCTIONS_PATH,
|
||||
options.tags_path or TAGS_PATH,
|
||||
selector=operator_selector,
|
||||
install_dir=functionalization_install_dir,
|
||||
template_dir=functionalization_templates_dir,
|
||||
operator_selector=get_selector(
|
||||
options.selected_op_list_path, options.operators_yaml_path
|
||||
),
|
||||
)
|
||||
|
||||
if options.gen_lazy_ts_backend:
|
||||
aten_path = os.path.dirname(os.path.dirname(options.native_functions_path))
|
||||
ts_backend_yaml = os.path.join(aten_path, "native/ts_native_functions.yaml")
|
||||
ts_native_functions = "torch/csrc/lazy/ts_backend/ts_native_functions.cpp"
|
||||
ts_node_base = "torch/csrc/lazy/ts_backend/ts_node.h"
|
||||
lazy_install_dir = os.path.join(install_dir, "lazy", "generated")
|
||||
install_dir = options.install_dir or os.fspath(options.gen_dir / "torch/csrc")
|
||||
lazy_install_dir = os.path.join(install_dir, "lazy/generated")
|
||||
os.makedirs(lazy_install_dir, exist_ok=True)
|
||||
|
||||
assert os.path.isfile(ts_backend_yaml), (
|
||||
|
||||
@ -276,32 +276,30 @@ add_custom_command(
|
||||
WORKING_DIRECTORY
|
||||
"${TORCH_ROOT}"
|
||||
)
|
||||
if(USE_DISTRIBUTED)
|
||||
if(WIN32)
|
||||
append_filelist("libtorch_python_distributed_core_sources" TORCH_PYTHON_SRCS)
|
||||
else()
|
||||
append_filelist("libtorch_python_distributed_sources" TORCH_PYTHON_SRCS)
|
||||
endif()
|
||||
# Disable certain warnings for GCC-9.X
|
||||
if(CMAKE_COMPILER_IS_GNUCXX)
|
||||
set_source_files_properties(${TORCH_SRC_DIR}/csrc/distributed/autograd/init.cpp PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type")
|
||||
set_source_files_properties(${TORCH_SRC_DIR}/csrc/distributed/rpc/testing/init.cpp PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type")
|
||||
set_source_files_properties(${TORCH_SRC_DIR}/csrc/distributed/c10d/init.cpp PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type")
|
||||
endif()
|
||||
# NCCL is a private dependency of libtorch, but libtorch_python includes
|
||||
# some private headers of libtorch, which in turn include NCCL. As a hacky
|
||||
# alternative to making NCCL a public dependency of libtorch, we make it
|
||||
# a private dependency of libtorch_python as well.
|
||||
if(USE_NCCL)
|
||||
list(APPEND TORCH_PYTHON_LINK_LIBRARIES __caffe2_nccl)
|
||||
endif()
|
||||
# Same for MPI.
|
||||
if(USE_MPI)
|
||||
list(APPEND TORCH_PYTHON_LINK_LIBRARIES MPI::MPI_CXX)
|
||||
endif()
|
||||
list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_C10D)
|
||||
|
||||
if(WIN32)
|
||||
append_filelist("libtorch_python_distributed_core_sources" TORCH_PYTHON_SRCS)
|
||||
else()
|
||||
append_filelist("libtorch_python_distributed_sources" TORCH_PYTHON_SRCS)
|
||||
endif()
|
||||
# Disable certain warnings for GCC-9.X
|
||||
if(CMAKE_COMPILER_IS_GNUCXX)
|
||||
set_source_files_properties(${TORCH_SRC_DIR}/csrc/distributed/autograd/init.cpp PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type")
|
||||
set_source_files_properties(${TORCH_SRC_DIR}/csrc/distributed/rpc/testing/init.cpp PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type")
|
||||
set_source_files_properties(${TORCH_SRC_DIR}/csrc/distributed/c10d/init.cpp PROPERTIES COMPILE_FLAGS "-Wno-cast-function-type")
|
||||
endif()
|
||||
# NCCL is a private dependency of libtorch, but libtorch_python includes
|
||||
# some private headers of libtorch, which in turn include NCCL. As a hacky
|
||||
# alternative to making NCCL a public dependency of libtorch, we make it
|
||||
# a private dependency of libtorch_python as well.
|
||||
if(USE_NCCL)
|
||||
list(APPEND TORCH_PYTHON_LINK_LIBRARIES __caffe2_nccl)
|
||||
endif()
|
||||
# Same for MPI.
|
||||
if(USE_MPI)
|
||||
list(APPEND TORCH_PYTHON_LINK_LIBRARIES MPI::MPI_CXX)
|
||||
endif()
|
||||
list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_C10D)
|
||||
|
||||
if(USE_NCCL AND NOT WIN32)
|
||||
list(APPEND TORCH_PYTHON_SRCS
|
||||
@ -369,10 +367,6 @@ if(BUILD_LIBTORCHLESS)
|
||||
target_compile_definitions(torch_python PRIVATE USE_C10D_NCCL)
|
||||
endif()
|
||||
|
||||
if(USE_DISTRIBUTED)
|
||||
target_compile_definitions(torch_python PRIVATE USE_DISTRIBUTED)
|
||||
endif()
|
||||
|
||||
if(USE_MPI AND USE_C10D_MPI)
|
||||
target_compile_definitions(torch_python PRIVATE USE_C10D_MPI)
|
||||
endif()
|
||||
|
||||
@ -30,7 +30,6 @@ from torch._C import (
|
||||
_cpu,
|
||||
_dynamo,
|
||||
_export,
|
||||
_functionalization,
|
||||
_functorch,
|
||||
_lazy,
|
||||
_lazy_ts_backend,
|
||||
|
||||
@ -78,7 +78,6 @@ class _KinetoEvent:
|
||||
def privateuse1_elapsed_us(self) -> int: ...
|
||||
def is_user_annotation(self) -> bool: ...
|
||||
def is_hidden_event(self) -> bool: ...
|
||||
def metadata_json(self) -> str: ...
|
||||
|
||||
class _ProfilerResult:
|
||||
def events(self) -> list[_KinetoEvent]: ...
|
||||
|
||||
@ -851,3 +851,12 @@ class ProcessGroupXCCL(Backend):
|
||||
|
||||
def _set_process_group(pg: ProcessGroup) -> None: ...
|
||||
def _current_process_group() -> ProcessGroup: ...
|
||||
def _dump_nccl_trace_json(
|
||||
includeCollectives: Optional[bool] = ...,
|
||||
onlyActive: Optional[bool] = ...,
|
||||
) -> bytes: ...
|
||||
def _dump_nccl_trace(
|
||||
includeCollectives: Optional[bool] = ...,
|
||||
includeStackTraces: Optional[bool] = ...,
|
||||
onlyActive: Optional[bool] = ...,
|
||||
) -> bytes: ...
|
||||
|
||||
@ -1,16 +0,0 @@
|
||||
from torch import Tensor
|
||||
from torch.types import _bool
|
||||
|
||||
# Defined in torch/csrc/functionalization/Module.cpp
|
||||
|
||||
class ViewMeta:
|
||||
has_symbolic_inputs: _bool
|
||||
|
||||
# Returns the list of ViewMeta instances of the given functional tensor.
|
||||
#
|
||||
# Although we do have python bindings for their types, we won't
|
||||
# expose them here, since they should not be used by users.
|
||||
def get_view_meta_sequence(tensor: Tensor) -> list[ViewMeta]: ...
|
||||
|
||||
# Applies the ViewMeta sequence on top of the given base.
|
||||
def apply_view_meta_sequence(base: Tensor, sequence: list[ViewMeta]) -> Tensor: ...
|
||||
@ -51,7 +51,6 @@ from .resume_execution import TORCH_DYNAMO_RESUME_IN_PREFIX
|
||||
from .utils import (
|
||||
getfile,
|
||||
hashable,
|
||||
is_annotate_wrapped_function,
|
||||
is_lru_cache_wrapped_function,
|
||||
NP_SUPPORTED_MODULES,
|
||||
unwrap_if_wrapper,
|
||||
@ -155,7 +154,6 @@ manual_torch_name_rule_map: dict[
|
||||
type[UserFunctionVariable],
|
||||
],
|
||||
] = {
|
||||
"torch.fx.traceback.annotate": UserFunctionVariable,
|
||||
"torch.onnx.is_in_onnx_export": TorchInGraphFunctionVariable,
|
||||
"torch.onnx.operators.shape_as_tensor": TorchInGraphFunctionVariable,
|
||||
"torch.overrides.is_tensor_like": TorchInGraphFunctionVariable,
|
||||
@ -3004,8 +3002,6 @@ def get_torch_obj_rule_map() -> dict[Any, type["VariableTracker"]]:
|
||||
continue
|
||||
obj = torch_dir + k[len("torch/") :]
|
||||
if obj is not None:
|
||||
if is_annotate_wrapped_function(obj):
|
||||
obj = obj.__wrapped__
|
||||
if is_lru_cache_wrapped_function(obj):
|
||||
obj = obj.__wrapped__
|
||||
if obj in d and d[obj] != v:
|
||||
|
||||
@ -1101,14 +1101,6 @@ def is_lru_cache_wrapped_function(
|
||||
)
|
||||
|
||||
|
||||
def is_annotate_wrapped_function(
|
||||
value: Any,
|
||||
) -> bool:
|
||||
return value == torch.fx.traceback.annotate and is_function(
|
||||
inspect.getattr_static(value, "__wrapped__")
|
||||
)
|
||||
|
||||
|
||||
_FuncTypes: TypeAlias = Union[
|
||||
types.FunctionType,
|
||||
types.BuiltinFunctionType,
|
||||
|
||||
@ -284,6 +284,19 @@ def check_cacheable(gm: torch.fx.GraphModule):
|
||||
check_cacheable(gm.saved_tensors_hooks_unpack_0) # type: ignore[arg-type]
|
||||
|
||||
|
||||
def check_metadata_cacheable(metadata: ViewAndMutationMeta):
|
||||
"""
|
||||
When view replay is turned on, we bypass autograd cache if
|
||||
the output is aliased.
|
||||
"""
|
||||
if config.view_replay_for_aliased_outputs:
|
||||
for info in metadata.output_info:
|
||||
if info.functional_tensor is not None:
|
||||
raise BypassAOTAutogradCache(
|
||||
"Cannot cache a graph with functional tensor"
|
||||
)
|
||||
|
||||
|
||||
class AOTAutogradCacheDetails(FxGraphHashDetails):
|
||||
"""
|
||||
Object to capture all the details for a dynamo graph module relevant to computing
|
||||
@ -790,6 +803,7 @@ class GenericAOTAutogradCacheEntry(Generic[TForward, TBackward]):
|
||||
"""
|
||||
Perform any preparations to make the cache entry ready for serialization.
|
||||
"""
|
||||
check_metadata_cacheable(self.runtime_metadata)
|
||||
self.compiled_fw.pre_save()
|
||||
if self.compiled_bw is not None:
|
||||
self.compiled_bw.pre_save()
|
||||
|
||||
@ -43,10 +43,10 @@ from .functional_utils import (
|
||||
has_metadata_mutation,
|
||||
MetadataKey,
|
||||
to_fun,
|
||||
ViewMetaSequence,
|
||||
was_inductor_storage_resized,
|
||||
)
|
||||
from .schemas import (
|
||||
FunctionalTensorMetadataEq,
|
||||
InputAliasInfo,
|
||||
MemoryFormatMeta,
|
||||
MutationType,
|
||||
@ -640,7 +640,7 @@ from a multi-output view call"
|
||||
#
|
||||
# The FunctionalTensor will be saved if one of the 2 conditions below
|
||||
# is true:
|
||||
view_meta_sequence = None
|
||||
functional_tensor = None
|
||||
if (
|
||||
# 1. If the output_type is either of:
|
||||
# (i) alias_of_intermediate;
|
||||
@ -672,7 +672,7 @@ from a multi-output view call"
|
||||
and not input_info[base_idx].mutates_metadata
|
||||
):
|
||||
if isinstance(o, FunctionalTensor):
|
||||
view_meta_sequence = ViewMetaSequence(o)
|
||||
functional_tensor = FunctionalTensorMetadataEq(o.elem)
|
||||
|
||||
out_info = OutputAliasInfo(
|
||||
output_type=output_type,
|
||||
@ -680,7 +680,7 @@ from a multi-output view call"
|
||||
base_idx=base_idx,
|
||||
dynamic_dims=dynamic_dims,
|
||||
requires_grad=isinstance(o, torch.Tensor) and o.requires_grad,
|
||||
view_meta_sequence=view_meta_sequence,
|
||||
functional_tensor=functional_tensor,
|
||||
)
|
||||
output_info.append(out_info)
|
||||
|
||||
|
||||
@ -14,7 +14,6 @@ from typing import Optional
|
||||
|
||||
import torch
|
||||
from torch import Tensor
|
||||
from torch._C import _functionalization
|
||||
from torch._logging import getArtifactLogger
|
||||
from torch._subclasses.fake_tensor import FakeTensor
|
||||
from torch._subclasses.functional_tensor import FunctionalTensor
|
||||
@ -225,9 +224,9 @@ def gen_alias_from_base(
|
||||
aliased_base_tensor,
|
||||
target_meta_tensor,
|
||||
target_requires_grad,
|
||||
target_view_meta_sequence: Optional[ViewMetaSequence] = None,
|
||||
target_functional_tensor: Optional[FunctionalTensorMetadataEq] = None,
|
||||
*,
|
||||
replay_views: bool,
|
||||
replay_views,
|
||||
):
|
||||
# Patch the correct requires_grad field of the output tensor, depending on whether:
|
||||
# (i) the reconstructed output (out) was came from a tensor that requires grad or not;
|
||||
@ -246,11 +245,13 @@ def gen_alias_from_base(
|
||||
# to replay them (view functions) on the aliased_base_tensor.
|
||||
if (
|
||||
replay_views
|
||||
and target_view_meta_sequence is not None
|
||||
and not any(vm.has_symbolic_inputs for vm in target_view_meta_sequence.sequence)
|
||||
and target_functional_tensor is not None
|
||||
and not torch._functionalize_is_symbolic(target_functional_tensor.tensor)
|
||||
):
|
||||
out = _functionalization.apply_view_meta_sequence(
|
||||
aliased_base_tensor, target_view_meta_sequence.sequence
|
||||
functional_tensor = target_functional_tensor.tensor
|
||||
|
||||
out = torch._functionalize_apply_view_metas(
|
||||
functional_tensor, aliased_base_tensor
|
||||
)
|
||||
# If re-applying the ViewMeta sequence succeeded, there should be no more
|
||||
# problems going forward. We just check we got to the target shape and
|
||||
@ -356,45 +357,25 @@ class MetadataKey:
|
||||
)
|
||||
|
||||
|
||||
# ViewMeta sequence wrapper for equality comparisons.
|
||||
#
|
||||
# Even though we can compare each ViewMeta instance, we compare the resulting
|
||||
# tensor metadata, instead. That's because the creation of synthetic bases + the
|
||||
# re-generation of input views might end-up creating a different sequence of
|
||||
# ViewMeta that is semantically equivalent. i.e. gets to a tensor with the same
|
||||
# metadata.
|
||||
#
|
||||
# Therefore, we store what the end result should look like as serializable
|
||||
# metadata.
|
||||
#
|
||||
# When logging, this class should look like:
|
||||
#
|
||||
# ViewMetaSequence(view, select_int, slice_Tensor)
|
||||
#
|
||||
# i.e. a parenthesized list of view operations within that ViewMeta sequence.
|
||||
class ViewMetaSequence:
|
||||
def __init__(self, tensor: FunctionalTensor) -> None:
|
||||
assert torch._is_functional_tensor(tensor.elem)
|
||||
self.sequence = _functionalization.get_view_meta_sequence(tensor.elem)
|
||||
self.metadata = MetadataKey.make(tensor)
|
||||
|
||||
def __repr__(self) -> str:
|
||||
suffix = len("_ViewMeta")
|
||||
types = ", ".join(type(vm).__name__[:-suffix] for vm in self.sequence)
|
||||
return f"ViewMetaSequence({types})"
|
||||
# Wrapper around a FunctionalTensorWrapper for comparing only the resulting metadata
|
||||
# after applying all the ViewMeta operations.
|
||||
class FunctionalTensorMetadataEq:
|
||||
def __init__(self, tensor: torch.Tensor) -> None:
|
||||
assert torch._is_functional_tensor(tensor)
|
||||
self.tensor = tensor
|
||||
|
||||
def __eq__(self, other: object) -> bool:
|
||||
# If other is None, then it probably means that we weren't able to recreate
|
||||
# the ViewMeta sequence. One example is when we update the view metadata by
|
||||
# calling: create_synthetic_base_metadata.
|
||||
# the FunctionalTensorMetadataEq. One of this cases is when we update the
|
||||
# view metadata by calling: create_synthetic_base_metadata.
|
||||
if other is None:
|
||||
return True
|
||||
|
||||
# Comparison against any other type is not implemented.
|
||||
if not isinstance(other, ViewMetaSequence):
|
||||
if not isinstance(other, FunctionalTensorMetadataEq):
|
||||
return NotImplemented
|
||||
|
||||
return self.metadata == other.metadata
|
||||
return has_same_metadata(self.tensor, other.tensor)
|
||||
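A rough sketch of the metadata-only comparison this equality relies on (an assumption about what has_same_metadata checks; the real helper may compare more fields):

```python
import torch

def same_metadata(a: torch.Tensor, b: torch.Tensor) -> bool:
    # Compare only shape/stride/dtype/offset, never values.
    return (
        a.shape == b.shape
        and a.stride() == b.stride()
        and a.dtype == b.dtype
        and a.storage_offset() == b.storage_offset()
    )

x = torch.arange(6.0).reshape(2, 3)
assert same_metadata(x, torch.zeros(2, 3))
assert not same_metadata(x, x.t())
```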
|
||||
|
||||
# new_arg and arg here are either:
|
||||
|
||||
@ -89,7 +89,7 @@ def remove_dupe_metadata(
|
||||
dynamic_dims=o.dynamic_dims,
|
||||
base_idx=None if o.base_idx is None else add_dupe_map[o.base_idx],
|
||||
requires_grad=o.requires_grad,
|
||||
view_meta_sequence=o.view_meta_sequence,
|
||||
functional_tensor=o.functional_tensor,
|
||||
)
|
||||
for o in m.output_info
|
||||
],
|
||||
@ -242,7 +242,7 @@ def create_synthetic_base_metadata(
|
||||
# Map the input idx pre-synthetic-bases to the new idx post-synthetic-bases
|
||||
base_idx=new_base_idx, # type: ignore[arg-type]
|
||||
requires_grad=o.requires_grad,
|
||||
view_meta_sequence=o.view_meta_sequence,
|
||||
functional_tensor=o.functional_tensor,
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
@ -150,7 +150,7 @@ class AliasOfInputHandler:
|
||||
self.base_idx = info.base_idx
|
||||
self.unwrap_out = _unwrap_tensoralias if trace_joint else _identity
|
||||
self.requires_grad = info.requires_grad
|
||||
self.view_meta_sequence = info.view_meta_sequence
|
||||
self.functional_tensor = info.functional_tensor
|
||||
self.replay_views = config.view_replay_for_aliased_outputs
|
||||
|
||||
def __call__(self, orig_inputs, fw_outs, out):
|
||||
@ -159,7 +159,7 @@ class AliasOfInputHandler:
|
||||
aliased_base_tensor,
|
||||
self.unwrap_out(out),
|
||||
self.requires_grad,
|
||||
self.view_meta_sequence,
|
||||
self.functional_tensor,
|
||||
replay_views=self.replay_views,
|
||||
)
|
||||
|
||||
@ -190,7 +190,7 @@ class AliasOfIntermediateHandler:
|
||||
|
||||
self.unwrap_out = _unwrap_tensoralias if trace_joint else _identity
|
||||
self.requires_grad = info.requires_grad
|
||||
self.view_meta_sequence = info.view_meta_sequence
|
||||
self.functional_tensor = info.functional_tensor
|
||||
self.replay_views = config.view_replay_for_aliased_outputs
|
||||
|
||||
def __call__(self, orig_inputs, fw_outs, out):
|
||||
@ -199,7 +199,7 @@ class AliasOfIntermediateHandler:
|
||||
self._unwrap_aliased_base_tensor(aliased_base_tensor),
|
||||
self.unwrap_out(out),
|
||||
self.requires_grad,
|
||||
self.view_meta_sequence,
|
||||
self.functional_tensor,
|
||||
replay_views=self.replay_views,
|
||||
)
|
||||
|
||||
|
||||
@ -7,6 +7,7 @@ input/output types, metadata, config, function signatures etc.
|
||||
from __future__ import annotations
|
||||
|
||||
import collections
|
||||
import dataclasses
|
||||
import functools
|
||||
import itertools
|
||||
from dataclasses import dataclass, field
|
||||
@ -31,7 +32,10 @@ from torch.fx.experimental._backward_state import BackwardState
|
||||
from torch.utils._python_dispatch import is_traceable_wrapper_subclass
|
||||
|
||||
from .. import config
|
||||
from .functional_utils import _check_if_mutation_can_be_in_graph, ViewMetaSequence
|
||||
from .functional_utils import (
|
||||
_check_if_mutation_can_be_in_graph,
|
||||
FunctionalTensorMetadataEq,
|
||||
)
|
||||
from .utils import strict_zip
|
||||
|
||||
|
||||
@ -113,14 +117,15 @@ class OutputAliasInfo:
|
||||
dynamic_dims: Optional[set[int]]
|
||||
# requires_grad
|
||||
requires_grad: bool
|
||||
# Sequence of ViewMeta objects.
|
||||
# FunctionalTensorWrapper that represents this output.
|
||||
#
|
||||
# Provides us the means to re-run view functions on other tensors.
|
||||
# Provides us the means to replay views from it.
|
||||
#
|
||||
# We need to wrap the actual list of ViewMeta with this class so that
|
||||
# we compare the ViewMeta elements appropriately, i.e. their type and
|
||||
# the elements returned by the `as_tuple()` call.
|
||||
view_meta_sequence: Optional[ViewMetaSequence] = None
|
||||
# We need to wrap the actual FunctionalTensorWrapper with this class so that
|
||||
# we only compare the tensor's metadata. That's because with the transformations
|
||||
# of the model throughout AOTAutograd, the sequence of ViewMeta and the base
|
||||
# tensor might change.
|
||||
functional_tensor: Optional[FunctionalTensorMetadataEq] = None
|
||||
|
||||
|
||||
class MutationType(Enum):
|
||||
@ -660,6 +665,17 @@ class ViewAndMutationMeta:
|
||||
self.traced_tangent_metas = [extract_metadata(t) for t in self.traced_tangents]
|
||||
# Clear traced tangents at runtime
|
||||
self.traced_tangents = []
|
||||
new_output_info = []
|
||||
for out in self.output_info:
|
||||
if config.view_replay_for_aliased_outputs:
|
||||
new_out = out
|
||||
else:
|
||||
# If we're not using view_replay, remove the functional tensor.
|
||||
# Functional tensors are unfortunately not serializable,
|
||||
# so doing this is required for AOTAutograd caching.
|
||||
new_out = dataclasses.replace(out, functional_tensor=None)
|
||||
new_output_info.append(new_out)
|
||||
self.output_info = new_output_info
|
||||
for inp_meta in self.subclass_inp_meta:
|
||||
if isinstance(inp_meta, SubclassCreationMeta):
|
||||
inp_meta.make_runtime_safe()
|
||||
|
||||
@ -23,11 +23,7 @@ from torch._higher_order_ops.triton_kernel_wrap import (
from torch._inductor.codecache import LambdaFuture, PyCodeCache
from torch._inductor.runtime.triton_heuristics import CachingAutotuner
from torch._inductor.select_algorithm import extern_kernels # noqa: F401
from torch._inductor.utils import (
convert_shape_to_symint,
convert_to_symint,
sympy_product,
)
from torch._inductor.utils import convert_shape_to_symint, convert_to_symint
from torch._inductor.virtualized import V
from torch._library.triton import wrap_triton
from torch.fx import GraphModule

@ -120,30 +116,20 @@ def replace_floor_div(expr: sympy.Expr) -> sympy.Expr:
def replace(expr: sympy.Expr) -> sympy.Expr:
expr = sympy.together(expr)

# Find division operations in the sympy.floor expression
# Div is either represented as Mul with:
# Rational denominator or Pow with negative exponent
if not isinstance(expr, sympy.core.mul.Mul):
return sympy.floor(expr)

if isinstance(expr.args[0], sympy.Rational):
frac = expr.args[0]
numerator = sympy_product(expr.args[1:]) * frac.numerator
denominator = frac.denominator

return FloorDiv(numerator, denominator)
elif isinstance(expr.args[0], sympy.Pow):
base = expr.args[0].base
exp = expr.args[0].exp
numerator = sympy_product(expr.args[1:])
if exp < 0:
denominator = base ** (-exp)
# Division is represented as a Mul with a Rational factor or a Pow with negative
# exponent. We convert floor(Mul(...)) to FloorDiv(numerator, denominator) by
# partitioning factors into the numerator and denominator.
(numerator, denominator) = (sympy.S.One,) * 2
for arg in sympy.Mul.make_args(expr):
if isinstance(arg, sympy.Rational):
numerator *= arg.numerator
denominator *= arg.denominator
elif isinstance(arg, sympy.Pow) and arg.exp.is_negative:
denominator *= arg.base**-arg.exp
else:
numerator = numerator * (base**exp)
denominator = 1
return FloorDiv(numerator, denominator)
else:
return sympy.floor(expr)
numerator *= arg

return FloorDiv(numerator, denominator)

return expr.replace(sympy.floor, replace)

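Editorial aside, not part of the diff: the rewritten replace() above partitions the factors of a sympy Mul into a numerator and a denominator so that floor(...) can be re-expressed as an integer FloorDiv. A minimal sketch of that partitioning is below; the real FloorDiv class from Inductor is not constructed here, the two parts are only printed.

import sympy

x, y = sympy.symbols("x y", positive=True, integer=True)
expr = sympy.together(x * y / 6)  # Mul(1/6, x, y)

numerator, denominator = sympy.S.One, sympy.S.One
for arg in sympy.Mul.make_args(expr):
    if isinstance(arg, sympy.Rational):
        # A Rational factor contributes to both sides, e.g. 1/6.
        numerator *= arg.numerator
        denominator *= arg.denominator
    elif isinstance(arg, sympy.Pow) and arg.exp.is_negative:
        # Negative powers go to the denominator, e.g. z**-1.
        denominator *= arg.base ** -arg.exp
    else:
        numerator *= arg

print(numerator, denominator)  # x*y 6, i.e. floor(x*y/6) -> FloorDiv(x*y, 6)
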
@ -930,10 +916,6 @@ class FxConverter:
call_args = self._lookup_args(line.call_args)
kernel = self.kernels[line.kernel_name]
tuner = kernel.tuner
# Use python_slow mode instead of python mode to avoid
# the round to neginf behaviour, which is not the convention
# in other languages.
tuner.grid_mode = "python_slow"

# Optionally autotune the kernels.
# The FX backend currently only supports compile-time tuning.

@ -1007,8 +989,7 @@ class FxConverter:
call_kwargs = dict(zip(signature, call_args))
call_kwargs.update(kernel_config.kwargs)

# Replace all sympy.floor with FloorDiv
# _generate_sym_node does not support sympy.floor
# Replace sympy.floor with FloorDiv, to make the expression traceable.
grid = [replace_floor_div(x) if isinstance(x, sympy.Expr) else x for x in grid]
wrapper_grid = [tuple(self._generate_sym_nodes(grid))]
call_kwargs = {

@ -880,14 +880,6 @@ def fuse_matmul_reduce_scatter(reduce_scatter: _ReduceScatterMatch) -> None:
if not is_symm_mem_enabled_for_group(group_name):
return

filter_matmul = None
if orig_scatter_dim == _get_tensor(input_node).ndim - 1:
# scaled_mm is not supported yet for last dim mm+rs
def _filter_out_scaled_matmul(matmul: _Matmul):
return not isinstance(matmul, _ScaledMatmul)

filter_matmul = _filter_out_scaled_matmul

# Currently fused_matmul_reduce_scatter doesn't return the matmul result,
# so we can't apply the fusion if the matmul result is used by multiple
# users. This is not a fundamental limitation of the fused op and can be

@ -899,16 +891,12 @@ fuse_matmul_reduce_scatter(reduce_scatter: _ReduceScatterMatch) -> None:
return

matmul = _find_producer_matmul(input_node)

if matmul is None:
log.warning(
"no producer matmul found for reduce scatter, skipping fuse_matmul_reduce_scatter fusion"
)
return

if filter_matmul and not filter_matmul(matmul):
return

if rs_wait_tensor_node in matmul.arg_ancestor_nodes:
log.warning(
"reduce-scatter result node is an ancestor of matmul, skipping fuse_matmul_reduce_scatter fusion"

@ -375,7 +375,7 @@ class CachingAutotuner(KernelInterface):
self.is_backward = False

# Mode for launch grid calculation
self.grid_mode: Literal["python", "python_slow", "cpp"] = "python"
self.grid_mode: Literal["python", "cpp"] = "python"

def is_statically_launchable(self):
"""

@ -3192,14 +3192,14 @@ class GridExpr:
"""Generate code for grid size expressions in launcher"""

inductor_meta: dict[str, Any]
mode: Literal["python", "cpp", "python_slow"] = "python"
mode: Literal["python", "cpp"] = "python"
prefix: list[str] = dataclasses.field(default_factory=list)
x_grid: Union[str, int] = 1
y_grid: Union[str, int] = 1
z_grid: Union[str, int] = 1

def __post_init__(self) -> None:
assert self.mode in ("python", "cpp", "python_slow")
assert self.mode in ("python", "cpp")

def generate(self, meta: dict[str, int]) -> None:
raise NotImplementedError

@ -3215,10 +3215,6 @@ class GridExpr:
# negative integer division is floored
if self.mode == "python":
return f"-(({numel}) // -({block}))"
# This is more generic than above, and works in languages where
# positive integer division is floored/truncated
elif self.mode == "python_slow":
return f"(({numel} + {block} - 1) // ({block}))"
# For cpp code gen
return f"(({numel} + ({block} - 1)) / ({block}))"

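Editorial aside, not part of the diff: the two Python formulas above compute the same ceiling division for positive operands. The "python" form leans on Python's floored division of negative numbers, which is exactly what other languages do not guarantee, hence the separate portable form. A quick sanity check of the equivalence:

def ceil_div_python(numel: int, block: int) -> int:
    # Relies on floored division of a negated numerator.
    return -((numel) // -(block))

def ceil_div_portable(numel: int, block: int) -> int:
    # Works wherever positive integer division is floored/truncated.
    return (numel + block - 1) // block

assert all(
    ceil_div_python(n, b) == ceil_div_portable(n, b)
    for n in range(1, 100)
    for b in range(1, 17)
)
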
@ -3227,7 +3223,7 @@ class GridExpr:
items = self._constant_fold(max, seq)
if len(items) <= 1:
return items[0]
if self.mode in ("python", "python_slow"):
if self.mode == "python":
return f"max({', '.join(map(str, items))})"
return functools.reduce(lambda x, y: f"std::max({x}, {y})", items)

@ -3250,7 +3246,7 @@ class GridExpr:

def assign_tmp(self, name: str, expr: Union[str, int]) -> str:
# Grid functions are one per kernel, so name collisions are fine
if self.mode in ("python", "python_slow"):
if self.mode == "python":
return f"{name} = {expr}"
if self.mode == "cpp":
return f"uint32_t {name} = {expr};"
@ -3260,7 +3256,7 @@
def from_meta(
inductor_meta: dict[str, Any],
cfg: Union[Config, dict[str, int]],
mode: Literal["python", "cpp", "python_slow"] = "python",
mode: Literal["python", "cpp"] = "python",
) -> GridExpr:
grid_cls = globals()[inductor_meta["grid_type"]]
assert issubclass(grid_cls, GridExpr)

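Editorial aside, not part of the diff: the maximum helper above emits a single max(...) call for Python launch code but folds the terms into nested std::max calls for C++. A small illustration with made-up grid terms:

import functools

items = ["((xnumel + 127) // 128)", "4", "ynumel"]  # hypothetical grid terms
python_form = f"max({', '.join(map(str, items))})"
cpp_form = functools.reduce(lambda x, y: f"std::max({x}, {y})", items)
print(python_form)  # max(((xnumel + 127) // 128), 4, ynumel)
print(cpp_form)     # std::max(std::max(((xnumel + 127) // 128), 4), ynumel)
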
@ -638,7 +638,6 @@ class profile:
device_resource_id=kineto_event.device_resource_id(),
flops=kineto_event.flops(),
is_user_annotation=kineto_event.is_user_annotation(),
metadata_json=kineto_event.metadata_json(),
)
max_evt_id = max(max_evt_id, fe.id)
if fe.device_type == DeviceType.CPU and not fe.is_async:

@ -491,7 +491,6 @@ class FunctionEvent(FormattedTimesMixin):
concrete_inputs=None,
kwinputs=None,
is_user_annotation=False,
metadata_json=None,
):
self.id: int = id
self.node_id: int = node_id
@ -527,7 +526,6 @@ class FunctionEvent(FormattedTimesMixin):
self.self_cpu_percent = -1
self.total_cpu_percent = -1
self.total_device_percent = -1
self.metadata_json = metadata_json

def append_kernel(self, name, device, duration):
assert self.device_type == DeviceType.CPU

@ -15,9 +15,7 @@
#include <torch/csrc/utils/cpp_stacktraces.h>
#include <torch/csrc/utils/pybind.h>

#if defined(USE_DISTRIBUTED)
#include <torch/csrc/distributed/c10d/exception.h>
#endif

inline void PyErr_SetString(PyObject* type, const std::string& message) {
PyErr_SetString(type, message.c_str());

@ -72,7 +72,6 @@
#include <torch/csrc/cpu/Module.h>
#include <torch/csrc/dynamo/init.h>
#include <torch/csrc/export/pybind.h>
#include <torch/csrc/functionalization/Module.h>
#include <torch/csrc/functorch/init.h>
#include <torch/csrc/fx/node.h>
#include <torch/csrc/inductor/aoti_package/pybind.h>

@ -122,14 +121,10 @@
#endif
#endif

#ifdef USE_DISTRIBUTED
#ifdef USE_C10D
#include <torch/csrc/distributed/autograd/python_autograd.h>
#include <torch/csrc/distributed/c10d/c10d.h>
#include <torch/csrc/distributed/rpc/rpc.h>
#include <torch/csrc/distributed/rpc/testing/testing.h>
#endif
#endif

#if defined(USE_VALGRIND)
#include <callgrind.h>

@ -409,9 +404,11 @@ static PyObject* THPModule_swap_tensor_impl(PyObject* _unused, PyObject* args) {
// The TensorImpls contain PyObjectSlots that have a reference to the PyObject
// associated with the TensorImpl. Swap this field as well.
std::optional<PyObject*> mb_obj_a =
a->cdata->unsafeGetTensorImpl()->pyobj_slot()->check_pyobj();
a->cdata->unsafeGetTensorImpl()->pyobj_slot()->check_pyobj(
/*ignore_hermetic_tls=*/false);
std::optional<PyObject*> mb_obj_b =
b->cdata->unsafeGetTensorImpl()->pyobj_slot()->check_pyobj();
b->cdata->unsafeGetTensorImpl()->pyobj_slot()->check_pyobj(
/*ignore_hermetic_tls=*/false);
TORCH_INTERNAL_ASSERT(
mb_obj_a.has_value() && mb_obj_b.has_value(),
"Both tensors should have PyObjects tagged by the current python interpreter");

@ -552,11 +549,7 @@ static PyObject* THPModule_getBackcompatKeepdimWarn(
}

static PyObject* THPModule_hasDistributed(PyObject* _unused, PyObject* noargs) {
#ifdef USE_DISTRIBUTED
Py_RETURN_TRUE;
#else
Py_RETURN_FALSE;
#endif
}

static PyObject* THPModule_showConfig(PyObject* module, PyObject* noargs) {

@ -2008,7 +2001,6 @@ PyObject* initModule() {
#ifdef USE_XPU
THPUtils_addPyMethodDefs(methods, THXPModule_methods());
#endif
#if defined(USE_DISTRIBUTED) && defined(USE_C10D)
THPUtils_addPyMethodDefs(
methods, torch::distributed::c10d::python_functions());
#ifndef _WIN32
@ -2018,7 +2010,6 @@ PyObject* initModule() {
methods, torch::distributed::autograd::python_functions());
THPUtils_addPyMethodDefs(
methods, torch::distributed::rpc::testing::python_functions());
#endif
#endif

static struct PyModuleDef torchmodule = {

@ -2091,7 +2082,6 @@ PyObject* initModule() {
torch::instruction_counter::initModule(module);
torch::initVerboseBindings(module);
ASSERT_TRUE(THPStorage_init(module));
torch::functionalization::initModule(module);

#ifdef USE_CUDA
// This will only initialise base classes and attach them to library namespace

@ -614,7 +614,8 @@ static void set_tensor_attr_with_capsule(
const c10::TensorImpl* tensor,
py::capsule& capsule,
const char* attr_name) {
std::optional<PyObject*> mb_obj = tensor->pyobj_slot()->check_pyobj();
std::optional<PyObject*> mb_obj = tensor->pyobj_slot()->check_pyobj(
/*ignore_hermetic_tls=*/false);
TORCH_CHECK(
mb_obj.has_value(), "Tensor subclass's PyInterpreter has no value");
auto obj = mb_obj.value();
@ -641,7 +642,8 @@ static c10::ArrayRef<T> get_set_cached_attr(
const c10::TensorImpl* tensor,
const char* base_attr_name,
const py::object& obj) {
std::optional<PyObject*> mb_obj = tensor->pyobj_slot()->check_pyobj();
std::optional<PyObject*> mb_obj =
tensor->pyobj_slot()->check_pyobj(getPyInterpreter());
TORCH_CHECK(
mb_obj.has_value(), "Tensor subclass's PyInterpreter has no value");
auto tensor_obj = mb_obj.value();

@ -41,8 +41,8 @@ PyObject* THPStorage_NewWithStorage(
|
||||
"Creating a Storage subclass from a class that does not inherit from ",
|
||||
"Storage is not possible. Make sure your class inherits from Storage.");
|
||||
|
||||
auto maybe_pyobj =
|
||||
_storage.unsafeGetStorageImpl()->pyobj_slot()->check_pyobj();
|
||||
auto maybe_pyobj = _storage.unsafeGetStorageImpl()->pyobj_slot()->check_pyobj(
|
||||
/*ignore_hermetic_tls=*/false);
|
||||
if (maybe_pyobj.has_value() && maybe_pyobj.value()) {
|
||||
TORCH_CHECK(
|
||||
allow_preexisting_pyobj,
|
||||
@ -93,7 +93,8 @@ PyObject* THPStorage_Wrap(c10::Storage storage) {
|
||||
}
|
||||
c10::impl::PyObjectSlot* pyobj_slot = storage_impl->pyobj_slot();
|
||||
|
||||
std::optional<PyObject*> maybe_pyobj = pyobj_slot->check_pyobj();
|
||||
std::optional<PyObject*> maybe_pyobj = pyobj_slot->check_pyobj(
|
||||
/*ignore_hermetic_tls=*/false);
|
||||
if (maybe_pyobj.has_value()) {
|
||||
auto obj = *maybe_pyobj;
|
||||
if (obj) {
|
||||
@ -126,8 +127,8 @@ static bool THPStorage_isPreservable(THPStorage* self) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (storage.unsafeGetStorageImpl()->pyobj_slot()->check_pyobj() !=
|
||||
(PyObject*)self) {
|
||||
if (storage.unsafeGetStorageImpl()->pyobj_slot()->check_pyobj(
|
||||
/*ignore_hermetic_tls=*/true) != (PyObject*)self) {
|
||||
return false;
|
||||
}
|
||||
if (storage.use_count() <= 1) {
|
||||
@ -144,7 +145,8 @@ static bool THPStorage_tryPreserve(THPStorage* self) {
|
||||
const auto& storage = THPStorage_Unpack(self);
|
||||
c10::StorageImpl* storage_impl = storage.unsafeGetStorageImpl();
|
||||
|
||||
auto maybe_pyobj = storage_impl->pyobj_slot()->check_pyobj();
|
||||
auto maybe_pyobj = storage_impl->pyobj_slot()->check_pyobj(
|
||||
/*ignore_hermetic_tls=*/true);
|
||||
// NOTE: It is possible to just set the PyObjectSlot here, but the point is
|
||||
// that we should have already set PyObjectSlot when the storage PyObject
|
||||
// was created.
|
||||
|
||||
@ -245,12 +245,13 @@ static void general_trace_function(
|
||||
tracer::addInputs(
|
||||
node, args[i].name().c_str(), iter->toBoolList().vec());
|
||||
} else {
|
||||
TORCH_CHECK(false, "unsupported input list type: ", elem_type->str());
|
||||
throw std::runtime_error(
|
||||
"unsupported input list type: " + elem_type->str());
|
||||
}
|
||||
} else if (iter->isObject()) {
|
||||
tracer::addInputs(node, args[i].name().c_str(), iter->toObject());
|
||||
} else {
|
||||
TORCH_CHECK(false, "unsupported input type: ", type->str());
|
||||
throw std::runtime_error("unsupported input type: " + type->str());
|
||||
}
|
||||
}
|
||||
graph->insertNode(node);
|
||||
@ -276,19 +277,16 @@ static void general_trace_function(
|
||||
AT_ASSERT(iter->isTensorList());
|
||||
tracer::addOutput(node, iter->toTensorList());
|
||||
} else {
|
||||
TORCH_CHECK(
|
||||
false, "unsupported output list type: ", elem_type->str());
|
||||
throw std::runtime_error(
|
||||
"unsupported output list type: " + elem_type->str());
|
||||
}
|
||||
} else if (type->kind() == TypeKind::ClassType) {
|
||||
AT_ASSERT(iter->isObject());
|
||||
tracer::addOutput(node, iter->toObject());
|
||||
} else {
|
||||
TORCH_CHECK(
|
||||
false,
|
||||
"unsupported output type: ",
|
||||
type->str(),
|
||||
", from operator: ",
|
||||
toString(op.operator_name()));
|
||||
throw std::runtime_error(
|
||||
"unsupported output type: " + type->str() +
|
||||
", from operator: " + toString(op.operator_name()));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -11,8 +11,10 @@ void check_single_result(
|
||||
const at::TensorBase& value,
|
||||
const at::TensorBase& result,
|
||||
const std::string& hook_name) {
|
||||
TORCH_CHECK(
|
||||
value.defined(), "can't replace a empty gradient with a non-empty value");
|
||||
if (!value.defined()) {
|
||||
throw std::runtime_error(
|
||||
"can't replace a empty gradient with a non-empty value");
|
||||
}
|
||||
torch::autograd::check_variable_result(value, result, hook_name);
|
||||
}
|
||||
} // namespace
|
||||
|
||||
@ -482,31 +482,30 @@ void check_variable_result(
const at::TensorBase& original,
const at::TensorBase& result,
const std::string& hook_name) {
TORCH_CHECK(
original.options().type_equal(result.options()),
"hook '",
hook_name,
"' has changed the type of value (was ",
original.toString(),
" got ",
result.toString(),
")");
if (!original.options().type_equal(result.options())) {
std::stringstream ss;
ss << "hook '" << hook_name << "' has changed the type of value (";
ss << "was " << original.toString() << " got ";
ss << result.toString() << ")";
throw std::runtime_error(ss.str());
}

TORCH_CHECK(
original.is_cuda() == result.is_cuda(),
"hook '",
hook_name,
"' has changed the type of value (was ",
original.is_cuda() ? "CUDA tensor" : "CPU tensor",
" got ",
result.is_cuda() ? "CUDA tensor" : "CPU tensor",
")");
if (original.is_cuda() != result.is_cuda()) {
std::stringstream ss;
ss << "hook '" << hook_name << "' has changed the type of value";
if (original.is_cuda()) {
ss << " (was CUDA tensor got CPU tensor)";
} else {
ss << " (was CPU tensor got CUDA tensor)";
}
throw std::runtime_error(ss.str());
}

TORCH_CHECK(
original.sym_sizes().vec() == result.sym_sizes().vec(),
"hook '",
hook_name,
"' has changed the size of value");
if (original.sym_sizes().vec() != result.sym_sizes().vec()) {
std::stringstream ss;
ss << "hook '" << hook_name << "' has changed the size of value";
throw std::runtime_error(ss.str());
}
}

AutogradContext::AutogradContext(PackedArgs& packed_args) {

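Editorial aside, not part of the diff: the hunk above switches between the two error-reporting styles that recur throughout these files. TORCH_CHECK concatenates its trailing arguments into the message and raises a c10::Error, while the hand-rolled form builds the message with a std::stringstream and throws std::runtime_error. A minimal in-tree sketch of the two styles side by side; check_sizes_match is a made-up helper, not part of the change.

#include <c10/util/Exception.h>
#include <cstdint>
#include <sstream>
#include <stdexcept>
#include <string>

void check_sizes_match(int64_t original, int64_t result, const std::string& hook_name) {
  // Macro style: condition first, then message fragments.
  TORCH_CHECK(
      original == result,
      "hook '", hook_name, "' has changed the size of value");

  // Equivalent hand-rolled style used by the other branch of this hunk.
  if (original != result) {
    std::stringstream ss;
    ss << "hook '" << hook_name << "' has changed the size of value";
    throw std::runtime_error(ss.str());
  }
}
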
@ -228,32 +228,30 @@ inline variable_list CppNode_apply_functional(
|
||||
}
|
||||
}
|
||||
|
||||
TORCH_CHECK(
|
||||
num_outputs == num_forward_inputs,
|
||||
"function ",
|
||||
name,
|
||||
" returned an incorrect number of gradients (expected ",
|
||||
num_forward_inputs,
|
||||
", got ",
|
||||
num_outputs,
|
||||
")");
|
||||
if (num_outputs != num_forward_inputs) {
|
||||
std::string msg("function ");
|
||||
msg += name + " returned an incorrect number of gradients (expected ";
|
||||
msg += std::to_string(num_forward_inputs) + ", got ";
|
||||
msg += std::to_string(num_outputs) + ")";
|
||||
throw std::runtime_error(msg);
|
||||
}
|
||||
|
||||
variable_list results;
|
||||
results.reserve(num_outputs);
|
||||
for (const auto i : c10::irange(num_outputs)) {
|
||||
if (!is_variable_input_[i]) {
|
||||
TORCH_CHECK(
|
||||
outputs[i].defined() == false,
|
||||
"function ",
|
||||
name,
|
||||
" returned a gradient different that is defined at position ",
|
||||
i + 1,
|
||||
", std the corresponding forward input was not a Variable");
|
||||
if (outputs[i].defined()) {
|
||||
std::string msg("function ");
|
||||
msg += name +
|
||||
" returned a gradient different that is defined at position ";
|
||||
msg += std::to_string(i + 1) +
|
||||
", std the corresponding forward input was not a Variable";
|
||||
throw std::runtime_error(msg);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
results.emplace_back(outputs[i]);
|
||||
}
|
||||
|
||||
return results;
|
||||
}
|
||||
|
||||
|
||||
@ -707,8 +707,9 @@ void GraphTask::mark_as_completed_and_run_post_processing() {
|
||||
}
|
||||
|
||||
void GraphTask::exec_post_processing() {
|
||||
TORCH_CHECK(
|
||||
not_ready_.empty(), "could not compute gradients for some functions");
|
||||
if (!not_ready_.empty()) {
|
||||
throw std::runtime_error("could not compute gradients for some functions");
|
||||
}
|
||||
|
||||
// set the thread_local current_graph_task_ as more callbacks can be installed
|
||||
// by existing final callbacks.
|
||||
@ -1148,13 +1149,12 @@ void Engine::evaluate_function(
|
||||
for (const auto i : c10::irange(num_outputs)) {
|
||||
auto& output = outputs[i];
|
||||
at::OptionalDeviceGuard guard(device_of(output));
|
||||
TORCH_CHECK(
|
||||
!output.defined() || !isnan(output)._is_any_true().item<bool>(),
|
||||
"Function '",
|
||||
fn.name(),
|
||||
"' returned nan values in its ",
|
||||
i,
|
||||
"th output.");
|
||||
if (output.defined() && isnan(output)._is_any_true().item<bool>()) {
|
||||
std::stringstream ss;
|
||||
ss << "Function '" << fn.name() << "' returned nan values in its " << i
|
||||
<< "th output.";
|
||||
throw std::runtime_error(ss.str());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -1175,7 +1175,7 @@ void Engine::evaluate_function(
|
||||
|
||||
if (it == dependencies.end()) {
|
||||
auto name = next.function->name();
|
||||
TORCH_CHECK(false, "dependency not found for ", name);
|
||||
throw std::runtime_error(std::string("dependency not found for ") + name);
|
||||
} else if (--it->second == 0) {
|
||||
dependencies.erase(it);
|
||||
is_ready = true;
|
||||
|
||||
@ -17,7 +17,7 @@ variable_list Error::apply(variable_list&& inputs) {
}

variable_list Error::apply(variable_list&& inputs) const {
TORCH_CHECK(false, msg);
throw std::runtime_error(msg);
}

void Error::compiled_args(CompiledNodeArgs& args) const {

@ -8,9 +8,7 @@
|
||||
#include <torch/csrc/autograd/python_autograd.h>
|
||||
#include <torch/csrc/autograd/python_cpp_function.h>
|
||||
#include <torch/csrc/autograd/python_variable.h>
|
||||
#ifdef USE_DISTRIBUTED
|
||||
#include <torch/csrc/distributed/autograd/functions/sendrpc_backward.h>
|
||||
#endif
|
||||
#include <torch/csrc/jit/python/python_tracer.h>
|
||||
#include <torch/csrc/utils/pybind.h>
|
||||
#include <torch/csrc/utils/python_numbers.h>
|
||||
@ -49,7 +47,7 @@ struct UndefinedGradCtor {
|
||||
|
||||
struct NoCtor {
|
||||
Node* operator()(PyObject* args) {
|
||||
TORCH_CHECK(false, "Cannot construct");
|
||||
throw std::runtime_error("Cannot construct");
|
||||
}
|
||||
};
|
||||
|
||||
@ -150,11 +148,9 @@ void THPAutograd_initFunctions() {
|
||||
static PyTypeObject CopyBackwardsClass;
|
||||
addClass<CopyBackwards, NoCtor>(module, CopyBackwardsClass, "CopyBackwards");
|
||||
|
||||
#ifdef USE_DISTRIBUTED
|
||||
static PyTypeObject SendRpcBackwardClass;
|
||||
addClass<torch::distributed::autograd::SendRpcBackward, NoCtor>(
|
||||
module, SendRpcBackwardClass, "SendRpcBackward");
|
||||
#endif
|
||||
|
||||
static PyTypeObject CopySlicesClass;
|
||||
addClass<CopySlices, NoCtor>(module, CopySlicesClass, "CopySlices");
|
||||
|
||||
@ -184,7 +184,9 @@ inline variable_list CopySlices::apply_impl(
|
||||
// see Note [Thread Safety on Autograd Node]
|
||||
std::lock_guard<std::mutex> lock(mutex_);
|
||||
|
||||
TORCH_CHECK(fn, ERR_BACKWARD_TWICE);
|
||||
if (!fn) {
|
||||
throw std::runtime_error(ERR_BACKWARD_TWICE);
|
||||
}
|
||||
|
||||
auto result =
|
||||
grad.new_empty_strided_symint(base.sym_sizes(), base.sym_strides());
|
||||
@ -250,7 +252,9 @@ variable_list CopySlices::apply_with_saved(
|
||||
|
||||
auto results = variable_list(num_outputs());
|
||||
if (grads[0].defined()) {
|
||||
TORCH_CHECK(fn, ERR_BACKWARD_TWICE);
|
||||
if (!fn) {
|
||||
throw std::runtime_error(ERR_BACKWARD_TWICE);
|
||||
}
|
||||
update_exec_info();
|
||||
|
||||
std::vector<bool> needs_input_grad;
|
||||
|
||||
@ -53,22 +53,18 @@ void check_input_variables(
|
||||
if (required_args == -1) {
|
||||
required_args = args;
|
||||
}
|
||||
TORCH_CHECK(
|
||||
inputs.size() == static_cast<size_t>(args),
|
||||
name,
|
||||
": expected ",
|
||||
args,
|
||||
" arguments (got ",
|
||||
inputs.size(),
|
||||
")");
|
||||
|
||||
if (inputs.size() != static_cast<size_t>(args)) {
|
||||
std::stringstream ss;
|
||||
ss << name << ": expected " << args << " arguments (got " << inputs.size();
|
||||
ss << ")";
|
||||
throw std::runtime_error(ss.str());
|
||||
}
|
||||
for (const auto i : c10::irange(required_args)) {
|
||||
TORCH_CHECK(
|
||||
inputs[i].defined() || allow_undefined,
|
||||
name,
|
||||
": expected Tensor at argument ",
|
||||
i,
|
||||
" (got None)");
|
||||
if (!inputs[i].defined() && !allow_undefined) {
|
||||
std::stringstream ss;
|
||||
ss << name << ": expected Tensor at argument " << i << " (got None)";
|
||||
throw std::runtime_error(ss.str());
|
||||
}
|
||||
}
|
||||
}
|
||||
} // namespace torch::autograd
|
||||
|
||||
@ -309,12 +309,8 @@ PyObject* THPAutograd_initExtension(PyObject* _unused, PyObject* unused) {
|
||||
})
|
||||
.def("nbytes", [](const KinetoEvent& e) { return e.nBytes(); })
|
||||
// whether the event is hidden
|
||||
.def(
|
||||
"is_hidden_event",
|
||||
[](const KinetoEvent& e) { return e.isHiddenEvent(); })
|
||||
// KinetoEvent metadata
|
||||
.def("metadata_json", [](const KinetoEvent& e) {
|
||||
return e.metadataJson();
|
||||
.def("is_hidden_event", [](const KinetoEvent& e) {
|
||||
return e.isHiddenEvent();
|
||||
});
|
||||
|
||||
m.def("_soft_assert_raises", &setSoftAssertRaises);
|
||||
|
||||
@ -37,8 +37,7 @@ extern "C" {
|
||||
// https://github.com/pytorch/pytorch/issues/51026
|
||||
__attribute__((weak)) int acc_get_device_type();
|
||||
__attribute__((weak)) int acc_get_device_type() {
|
||||
TORCH_CHECK(
|
||||
false,
|
||||
throw std::runtime_error(
|
||||
"Dummy implementation of acc_get_device_type is not supposed to be called!");
|
||||
}
|
||||
} // extern "C"
|
||||
@ -1068,17 +1067,6 @@ void KinetoEvent::getPerfEventCounters(std::vector<uint64_t>& in) const {
|
||||
[](const auto&) -> void { return; }));
|
||||
}
|
||||
|
||||
std::string KinetoEvent::metadataJson() const {
|
||||
return result_->visit(c10::overloaded(
|
||||
[](const ExtraFields<EventType::TorchOp>& op) -> std::string {
|
||||
return op.metadata_json_;
|
||||
},
|
||||
[](const ExtraFields<EventType::Kineto>& op) -> std::string {
|
||||
return op.metadata_json_;
|
||||
},
|
||||
[](const auto&) -> std::string { return std::string(""); }));
|
||||
}
|
||||
|
||||
#define FORWARD_FROM_RESULT(method_name, result_expr) \
|
||||
decltype(std::declval<KinetoEvent>().method_name()) \
|
||||
KinetoEvent::method_name() const { \
|
||||
|
||||
@ -65,7 +65,6 @@ struct TORCH_API KinetoEvent {
|
||||
int64_t privateuse1ElapsedUs() const;
|
||||
void getPerfEventCounters(torch::profiler::perf_counters_t&) const;
|
||||
extra_meta_t extraMeta() const;
|
||||
std::string metadataJson() const;
|
||||
|
||||
private:
|
||||
torch::profiler::impl::ProfilerVoidEventStub fallbackStart() const;
|
||||
|
||||
@ -97,7 +97,7 @@ struct TORCH_API LegacyEvent {
|
||||
case EventKind::MemoryAlloc:
|
||||
return "memory_alloc";
|
||||
}
|
||||
TORCH_CHECK(false, "unknown event kind");
|
||||
throw std::runtime_error("unknown event kind");
|
||||
}
|
||||
|
||||
EventKind kind() const {
|
||||
|
||||
@ -30,7 +30,7 @@ void PyAnomalyMetadata::store_stack() {
|
||||
void PyAnomalyMetadata::print_stack(const std::string& current_node_name) {
|
||||
pybind11::gil_scoped_acquire gil;
|
||||
if (!PyDict_Check(dict())) {
|
||||
TORCH_CHECK(false, "Anomaly metadata is not a python dictionary.");
|
||||
throw std::runtime_error("Anomaly metadata is not a python dictionary.");
|
||||
}
|
||||
PyObject* trace_stack = nullptr;
|
||||
if (PyDict_GetItemStringRef(dict(), ANOMALY_TRACE_KEY, &trace_stack) < 0) {
|
||||
|
||||
@ -261,7 +261,8 @@ PyTypeObject* _initFunctionPyTypeObject(
|
||||
type.tp_traverse = THPCppFunction_traverse;
|
||||
type.tp_clear = THPCppFunction_clear;
|
||||
if (PyType_Ready(&type) < 0) {
|
||||
TORCH_CHECK(false, "Unable to instantiate PyTypeObject for ", name);
|
||||
auto msg = std::string("Unable to instantiate PyTypeObject for ") + name;
|
||||
throw std::runtime_error(msg);
|
||||
}
|
||||
return &type;
|
||||
}
|
||||
|
||||
@ -501,7 +501,7 @@ static void child_atfork() {
|
||||
bool THPEngine_initModule(PyObject* module) {
|
||||
#ifndef _WIN32
|
||||
if (pthread_atfork(nullptr, nullptr, child_atfork) != 0) {
|
||||
TORCH_CHECK(false, "unable to set pthread_atfork handler");
|
||||
throw std::runtime_error("unable to set pthread_atfork handler");
|
||||
}
|
||||
#endif
|
||||
if (PyType_Ready(&THPEngineType) < 0)
|
||||
|
||||
@ -188,15 +188,13 @@ auto PyNode::apply(variable_list&& inputs) -> variable_list {
|
||||
}
|
||||
|
||||
// Now the number of gradients should match
|
||||
TORCH_CHECK(
|
||||
num_outputs == num_forward_inputs,
|
||||
"function ",
|
||||
name(),
|
||||
" returned an incorrect number of gradients (expected ",
|
||||
num_forward_inputs,
|
||||
", got ",
|
||||
num_outputs,
|
||||
")");
|
||||
if (num_outputs != num_forward_inputs) {
|
||||
std::string msg("function ");
|
||||
msg += name() + " returned an incorrect number of gradients (expected ";
|
||||
msg += std::to_string(num_forward_inputs) + ", got ";
|
||||
msg += std::to_string(num_outputs) + ")";
|
||||
throw std::runtime_error(msg);
|
||||
}
|
||||
|
||||
// Massage the Python results tuple back into a C++ variable_list
|
||||
return to_variable_list(r.get(), is_variable_input);
|
||||
@ -437,24 +435,24 @@ variable_list PyNode::to_variable_list(
|
||||
PyObject* output = PyTuple_GET_ITEM(outputs, i);
|
||||
bool was_variable = is_variable_input[i];
|
||||
if (!was_variable) {
|
||||
TORCH_CHECK(
|
||||
output == Py_None,
|
||||
"function ",
|
||||
name(),
|
||||
" returned a gradient different than None at position ",
|
||||
i + 1,
|
||||
", but the corresponding forward input was not a Variable");
|
||||
if (output != Py_None) {
|
||||
std::string msg("function ");
|
||||
msg += name() + " returned a gradient different than None at position ";
|
||||
msg += std::to_string(i + 1) +
|
||||
", but the corresponding forward input was not a Variable";
|
||||
throw std::runtime_error(msg);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
if (output == Py_None) {
|
||||
results.emplace_back();
|
||||
} else {
|
||||
TORCH_CHECK(
|
||||
THPVariable_Check(output),
|
||||
"expected Variable or None (got ",
|
||||
THPUtils_typename(output),
|
||||
")");
|
||||
|
||||
if (!THPVariable_Check(output)) {
|
||||
std::string msg("expected Variable or None (got ");
|
||||
msg += THPUtils_typename(output);
|
||||
msg += ")";
|
||||
throw std::runtime_error(msg);
|
||||
}
|
||||
results.emplace_back(THPVariable_Unpack(output));
|
||||
}
|
||||
}
|
||||
|
||||
@ -289,7 +289,9 @@ static variable_list unwrap_variables(PyObject* py_variables) {
|
||||
results[i] = THPVariable_Unpack(item);
|
||||
} else {
|
||||
// this should never happen, but just in case...
|
||||
TORCH_CHECK(false, "expected variable but got ", Py_TYPE(item)->tp_name);
|
||||
std::stringstream ss;
|
||||
ss << "expected variable but got " << Py_TYPE(item)->tp_name;
|
||||
throw std::runtime_error(ss.str());
|
||||
}
|
||||
}
|
||||
return results;
|
||||
@ -306,16 +308,14 @@ static void check_result(PyObject* prev, PyObject* result, PyObject* hook) {
|
||||
|
||||
auto prev_size = PyTuple_GET_SIZE(prev);
|
||||
auto result_size = PyTuple_GET_SIZE(result);
|
||||
|
||||
TORCH_CHECK(
|
||||
prev_size == result_size,
|
||||
"hook '",
|
||||
hook_name(hook),
|
||||
"' has returned an incorrect number of values (got ",
|
||||
result_size,
|
||||
", but expected ",
|
||||
prev_size,
|
||||
")");
|
||||
if (prev_size != result_size) {
|
||||
std::stringstream ss;
|
||||
auto name = hook_name(hook);
|
||||
ss << "hook '" << name << "' has returned an incorrect number ";
|
||||
ss << "of values (got " << result_size << ", but expected ";
|
||||
ss << prev_size << ")";
|
||||
throw std::runtime_error(ss.str());
|
||||
}
|
||||
|
||||
for (const auto i : c10::irange(prev_size)) {
|
||||
check_single_result(
|
||||
@ -330,9 +330,10 @@ static void check_single_result(
|
||||
if (_result == Py_None)
|
||||
return;
|
||||
|
||||
TORCH_CHECK(
|
||||
_original != Py_None,
|
||||
"can't replace a None gradient with a non-None value");
|
||||
if (_original == Py_None) {
|
||||
throw std::runtime_error(
|
||||
"can't replace a None gradient with a non-None value");
|
||||
}
|
||||
|
||||
if (!PyObject_IsInstance(_result, THPVariableClass)) {
|
||||
PyErr_Format(
|
||||
|
||||
@ -644,6 +644,15 @@ void initTorchFunctions(PyObject* module) {
|
||||
at::functionalization::impl::isFunctionalTensor(t));
|
||||
at::functionalization::impl::mark_mutation_hidden_from_autograd(t);
|
||||
});
|
||||
py_module.def(
|
||||
"_functionalize_apply_view_metas",
|
||||
[](const at::Tensor& tensor, const at::Tensor& base) {
|
||||
TORCH_INTERNAL_ASSERT(
|
||||
at::functionalization::impl::isFunctionalTensor(tensor));
|
||||
auto impl =
|
||||
at::functionalization::impl::unsafeGetFunctionalWrapper(tensor);
|
||||
return impl->apply_view_metas(base);
|
||||
});
|
||||
py_module.def("_functionalize_is_symbolic", [](const at::Tensor& t) {
|
||||
TORCH_INTERNAL_ASSERT(at::functionalization::impl::isFunctionalTensor(t));
|
||||
auto impl = at::functionalization::impl::unsafeGetFunctionalWrapper(t);
|
||||
|
||||
@ -265,7 +265,8 @@ PyObject* THPVariable_Wrap(const at::TensorBase& var) {
|
||||
}
|
||||
|
||||
std::optional<PyObject*> mb_obj =
|
||||
var.unsafeGetTensorImpl()->pyobj_slot()->check_pyobj();
|
||||
var.unsafeGetTensorImpl()->pyobj_slot()->check_pyobj(
|
||||
/*ignore_hermetic_tls=*/false);
|
||||
if (mb_obj.has_value()) {
|
||||
auto obj = *mb_obj;
|
||||
if (obj) {
|
||||
@ -328,8 +329,8 @@ static bool isResurrectable(THPVariable* self) {
|
||||
return false;
|
||||
}
|
||||
// Check if this is hermetic. If it is, no resurrection.
|
||||
if (tensor.unsafeGetTensorImpl()->pyobj_slot()->check_pyobj() !=
|
||||
(PyObject*)self) {
|
||||
if (tensor.unsafeGetTensorImpl()->pyobj_slot()->check_pyobj(
|
||||
/*ignore_hermetic_tls=*/false) != (PyObject*)self) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
@ -354,7 +355,8 @@ static bool THPVariable_tryResurrect(THPVariable* self) {
|
||||
!tensor.unsafeGetTensorImpl()->pyobj_slot()->owns_pyobj());
|
||||
|
||||
c10::TensorImpl* tensor_impl = tensor.unsafeGetTensorImpl();
|
||||
auto maybe_pyobj = tensor_impl->pyobj_slot()->check_pyobj();
|
||||
auto maybe_pyobj = tensor_impl->pyobj_slot()->check_pyobj(
|
||||
/*ignore_hermetic_tls=*/false);
|
||||
|
||||
TORCH_INTERNAL_ASSERT(
|
||||
maybe_pyobj.has_value(),
|
||||
@ -2221,8 +2223,8 @@ static int THPVariable_subclass_clear(THPVariable* self) {
|
||||
// because Tensor asked us to (it's already destructing).
|
||||
|
||||
if (!self->cdata.unsafeIsBorrowed() &&
|
||||
tensor.unsafeGetTensorImpl()->pyobj_slot()->check_pyobj() ==
|
||||
(PyObject*)self) {
|
||||
tensor.unsafeGetTensorImpl()->pyobj_slot()->check_pyobj(
|
||||
/*ignore_hermetic_tls=*/false) == (PyObject*)self) {
|
||||
// TODO: empirically, on OS X this assert appears to be untrue
|
||||
// In test_py_tensors_multi_async_call - ProcessGroupRpcTestWithSpawn
|
||||
// distributed/rpc/test_process_group_agent.py
|
||||
@ -2408,7 +2410,8 @@ static PyObject* THPVariable_NewWithVar(
|
||||
|
||||
// This function overwrite the Tensor's pyobj field without extra checks
|
||||
// Make sure it is not set otherwise we would leak memory
|
||||
auto mb_obj = _var.unsafeGetTensorImpl()->pyobj_slot()->check_pyobj();
|
||||
auto mb_obj = _var.unsafeGetTensorImpl()->pyobj_slot()->check_pyobj(
|
||||
/*ignore_hermetic_tls=*/false);
|
||||
|
||||
// Under some circumstances, we may attempt to create a new Python
|
||||
// object for a variable that already has a Python object. The most common
|
||||
|
||||
@ -11,8 +11,8 @@ struct TORCH_API SavedVariableHooks {
|
||||
virtual ~SavedVariableHooks() = default;
|
||||
virtual std::optional<std::pair<c10::SafePyObject, c10::SafePyObject>>
|
||||
retrieve_unpack_hook_data() const {
|
||||
TORCH_CHECK(
|
||||
false, "Compiled Autograd only supports python saved tensor hooks ");
|
||||
throw std::runtime_error(
|
||||
"Compiled Autograd only supports python saved tensor hooks ");
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@ -17,8 +17,8 @@ inline std::tuple<
|
||||
std::optional<at::MemoryFormat>>
|
||||
parse_to_conversion(PythonArgs& r, bool allow_copy) {
|
||||
if (r.idx == 0) {
|
||||
TORCH_CHECK(
|
||||
allow_copy || r.isNone(3), ".to() does not accept copy argument");
|
||||
if (!allow_copy && !r.isNone(3))
|
||||
throw std::runtime_error(".to() does not accept copy argument");
|
||||
return std::make_tuple(
|
||||
r.deviceOptional(0),
|
||||
r.scalartypeOptional(1),
|
||||
@ -26,8 +26,8 @@ parse_to_conversion(PythonArgs& r, bool allow_copy) {
|
||||
r.toBool(3),
|
||||
r.memoryformatOptional(4));
|
||||
} else if (r.idx == 1) {
|
||||
TORCH_CHECK(
|
||||
allow_copy || r.isNone(2), ".to() does not accept copy argument");
|
||||
if (!allow_copy && !r.isNone(2))
|
||||
throw std::runtime_error(".to() does not accept copy argument");
|
||||
return std::make_tuple(
|
||||
std::nullopt,
|
||||
r.scalartype(0),
|
||||
@ -36,8 +36,8 @@ parse_to_conversion(PythonArgs& r, bool allow_copy) {
|
||||
r.memoryformatOptional(3));
|
||||
} else {
|
||||
auto tensor = r.tensor(0);
|
||||
TORCH_CHECK(
|
||||
allow_copy || r.isNone(2), ".to() does not accept copy argument");
|
||||
if (!allow_copy && !r.isNone(2))
|
||||
throw std::runtime_error(".to() does not accept copy argument");
|
||||
return std::make_tuple(
|
||||
tensor.device(),
|
||||
tensor.scalar_type(),
|
||||
|
||||
@ -597,9 +597,10 @@ void VariableHooks::_backward(
|
||||
void VariableHooks::requires_grad_(
|
||||
const at::TensorBase& self,
|
||||
bool _requires_grad) const {
|
||||
TORCH_CHECK(
|
||||
self.is_leaf() || _requires_grad,
|
||||
autograd::utils::requires_grad_leaf_error(_requires_grad));
|
||||
if (!self.is_leaf() && !_requires_grad) {
|
||||
throw std::runtime_error(
|
||||
autograd::utils::requires_grad_leaf_error(_requires_grad));
|
||||
}
|
||||
self.set_requires_grad(_requires_grad);
|
||||
}
|
||||
|
||||
@ -623,7 +624,7 @@ const at::TensorBase& VariableHooks::base(const at::TensorBase& self) const {
|
||||
"Can't get base of non-backward view Tensor");
|
||||
return diff_view_meta->get_backward_view().base_;
|
||||
} else {
|
||||
TORCH_CHECK(false, "Can't get base of non-view Tensor");
|
||||
throw std::runtime_error("Can't get base of non-view Tensor");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -1,6 +1,5 @@
#include <torch/csrc/distributed/c10d/HashStore.hpp>

#include <unistd.h>
#include <cstdint>

#include <chrono>

@ -1,5 +1,5 @@
#include <ATen/ThreadLocalState.h>
#include <distributed/c10d/ProcessGroup.hpp>
#include <torch/csrc/distributed/c10d/ProcessGroup.hpp>
#include <torch/csrc/distributed/c10d/cuda/StreamBlock.hpp>

#include <torch/csrc/distributed/c10d/Work.hpp>

Some files were not shown because too many files have changed in this diff.