Compare commits

...

4 Commits

Author SHA1 Message Date
573407ca83 wip nested 2025-05-29 13:25:22 -07:00
091826fa41 [do-not-land][ca] default on for CI
ghstack-source-id: a5cf35a72ee0d8f53736737d0425c15863e47c71
Pull Request resolved: https://github.com/pytorch/pytorch/pull/152646
2025-05-28 16:41:03 -07:00
b7b3530b9e [ca] disable ca for functorch and run all HOO tests
ghstack-source-id: b4bb87f11704fc348e0b9e60886cae62b43b20cc
Pull Request resolved: https://github.com/pytorch/pytorch/pull/154147
2025-05-28 08:57:43 -07:00
4eb9024ebd [ca] fix hop_db tests
ghstack-source-id: 8b9cc388d1c10da4b1a06082ec5c9ed5d29b4e71
Pull Request resolved: https://github.com/pytorch/pytorch/pull/154133
2025-05-28 08:57:43 -07:00
17 changed files with 160 additions and 39 deletions

View File

@ -76,7 +76,8 @@ def get_fw_bw_graph(
)(*inps)
if requires_grad:
out.sum().backward()
with torch._dynamo.compiled_autograd._disable():
out.sum().backward()
return (fw_graph_cell[0], bw_graph_cell[0])
@ -429,7 +430,9 @@ def forward(self, arg0_1, arg1_1, arg2_1):
@skipIfNoDynamoSupport
def test_effectful_custom_op_with_subclasses(self):
with torch.library._scoped_library("_mylib", "FRAGMENT") as lib:
with torch.library._scoped_library(
"_mylib", "FRAGMENT"
) as lib, torch._dynamo.compiled_autograd._disable():
lib.define("zoo(Tensor x) -> Tensor")
lib.define("zoo2(Tensor x) -> Tensor")

View File

@ -4269,6 +4269,7 @@ def wrap_test_class(orig_cls):
fullgraph=name not in known_graph_breaks_tests,
)
),
config.patch(compiled_autograd=True),
test_contexts.get(name, contextlib.nullcontext()),
]
dct[name] = make_wrapped(fn, ctxs)
@ -4373,6 +4374,10 @@ known_graph_breaks_tests = {
"test_nested_checkpoint_set_early_stop", # dynamo disable
"test_nested_checkpoint_two_children_early_stop_False", # dynamo disable
"test_nested_checkpoint_two_children_early_stop_True", # dynamo disable
"test_dropout", # dynamo disable
"test_dropout_inductor", # dynamo disable
"test_function_with_kwargs", # dynamo disable
"test_module", # dynamo disable
}
test_contexts = {
@ -4435,6 +4440,10 @@ xfail_by_backend = {
"test_nested_checkpoint_early_stop_True", # unpack hook grad_fn semantics
"test_nested_checkpoint_two_children_early_stop_False", # unpack hook grad_fn semantics
"test_nested_checkpoint_two_children_early_stop_True", # unpack hook grad_fn semantics
"test_dropout", # functionalize_rng_ops not yet supported
"test_dropout_inductor", # functionalize_rng_ops not yet supported
"test_function_with_kwargs", # functionalize_rng_ops not yet supported
"test_module", # functionalize_rng_ops not yet supported
},
"eager": { # will be run without torch.compiling the CA graph
"test_setup_context_when_forward_has_default_args", # autograd.Function with class methods
@ -4482,6 +4491,9 @@ xfail_divergence_from_eager = {
"test_inplace_on_view_backward", # different node name: CompiledFunctionBackward
"test_nested_anomaly_printstack_cleanup", # anomaly NaN error message different
"test_not_implemented_grad", # Dynamo changes the types of exceptions
"test_grad_call_compiled_backward_fn", # different functorch error
"test_vjp_call_compiled_backward_fn", # different functorch error
"test_vmap_call_compiled_backward_fn", # different functorch error
}
skipped_tests = set()
@ -4495,26 +4507,30 @@ if IS_S390X:
test_autograd = load_test_module("test_autograd")
test_custom_ops = load_test_module("test_custom_ops")
test_higher_order_ops = load_test_module("dynamo/test_higher_order_ops")
TestAutogradWithCompiledAutograd = wrap_test_class(test_autograd.TestAutograd)
TestNestedCheckpointWithCompiledAutograd = wrap_test_class(
test_autograd.TestNestedCheckpoint
)
TestCustomOpWithCompiledAutograd = wrap_test_class(test_custom_ops.TestCustomOp)
HigherOrderOpTestsWithCompiledAutograd = wrap_test_class(
test_higher_order_ops.HigherOrderOpTests
)
FuncTorchHigherOrderOpTestsWithCompiledAutograd = wrap_test_class(
test_higher_order_ops.FuncTorchHigherOrderOpTests
)
ActivationCheckpointingTestsWithCompiledAutograd = wrap_test_class(
test_higher_order_ops.ActivationCheckpointingTests
)
if torch.distributed.is_available() and HAS_CUDA:
test_dtensor = load_test_module("distributed/tensor/test_dtensor_compile")
TestDTensorCompileWithCompiledAutograd = wrap_test_class(
test_dtensor.TestDTensorCompile
)
xfail_hops = {
# AssertionError: Tensor-likes are not close!
"auto_functionalize",
# BypassAOTAutogradCache: Cannot cache a graph with compiled autograd enabled
"invoke_subgraph",
# AssertionError: assert type(args[1].realize()) is TensorVariable
"map",
}
xfail_hops = {}
class TestCompiledAutogradOpInfo(TestCase):
@ -4561,7 +4577,7 @@ class TestCompiledAutogradOpInfo(TestCase):
# 1. Run eager
torch.manual_seed(123)
dummy = torch.randn(2, 2, dtype=dtype, device=device, requires_grad=True)
fn, op_out_ref = create_bwd_fn_closure(compiled_args, compiled_kwargs)
fn, op_out_ref = create_bwd_fn_closure(eager_args, eager_kwargs)
fn(dummy).backward()
self.assertEqual(len(op_out_ref), 1)
expected = op_out_ref[0]
@ -4578,7 +4594,7 @@ class TestCompiledAutogradOpInfo(TestCase):
self.assertEqual(expected, actual)
instantiate_device_type_tests(TestCompiledAutogradOpInfo, globals(), only_for=("cpu",))
instantiate_device_type_tests(TestCompiledAutogradOpInfo, globals())
instantiate_parametrized_tests(TestCompiledAutograd)
if __name__ == "__main__":
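
For context, wrap_test_class builds a list of context managers per test, now including config.patch(compiled_autograd=True), and runs the original test body under all of them. A hedged sketch of that wrapping pattern (make_wrapped's body is not shown in this diff; the ExitStack usage below is an assumption):

    import contextlib
    import functools

    def make_wrapped(fn, ctxs):
        @functools.wraps(fn)
        def wrapped(self):
            # Enter every context (e.g. config.patch(compiled_autograd=True)
            # and any per-test override from test_contexts) before the test.
            with contextlib.ExitStack() as stack:
                for ctx in ctxs:
                    stack.enter_context(ctx)
                return fn(self)
        return wrapped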

View File

@ -308,10 +308,12 @@ class AutogradCompilerInstance:
self.stack.enter_context(preserve_node_meta())
inputs_origins, sizes_origins, scalars_origins = origins
# tensor inputs to fake tensors
inputs = [
self.wrap_fake(x, self.source("inputs", idx))
for idx, x in enumerate(inputs)
]
# if strided nested tensor, can't fakify, must copy
# inputs = [
# self.wrap_fake(x, self.source("inputs", idx))
# for idx, x in enumerate(inputs)
# ]
inputs = [self.allocate_dummy() for x in inputs]
self.bind_objects_to_proxies(inputs, args_proxy, inputs_origins)
# size inputs to symints
@ -1356,6 +1358,8 @@ class AutogradCompilerInstance:
# state of the autograd engine dispatch, kept in sync by enable/disable context managers
compiled_autograd_enabled = False
nested_level = 0
# global flag to check if compiled autograd is enabled but Dynamo stance is "force_eager"
compiled_autograd_enabled_force_eager = False
@ -1414,12 +1418,15 @@ def _enable(compiler_fn, dynamic: bool = True):
torch._C._dynamo.compiled_autograd.set_verbose_logger(verbose_log)
global compiled_autograd_enabled
compiled_autograd_enabled = True
global nested_level
nested_level += 1
try:
with torch.autograd.set_multithreading_enabled(False):
yield
finally:
if not prior_compiler:
compiled_autograd_enabled = False
nested_level -= 1
torch._C._dynamo.compiled_autograd.set_autograd_compiler(
prior_compiler, prior_dynamic
)
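
The new nested_level counter tracks re-entrant _enable calls so that only the outermost exit clears compiled_autograd_enabled. A stripped-down sketch of that bookkeeping (names mirror the diff; the real _enable also wires up the compiler and verbose logger, omitted here):

    import contextlib

    compiled_autograd_enabled = False
    nested_level = 0

    @contextlib.contextmanager
    def _enable_sketch():
        global compiled_autograd_enabled, nested_level
        prior = compiled_autograd_enabled
        compiled_autograd_enabled = True
        nested_level += 1
        try:
            yield
        finally:
            # Only the outermost context restores the disabled state.
            if not prior:
                compiled_autograd_enabled = False
            nested_level -= 1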

View File

@ -810,7 +810,12 @@ class OptimizeContext(_TorchDynamoContext):
if _dynamic is None:
_dynamic = not torch._dynamo.config.assume_static_by_default
def noop():
pass
def call_compiled_autograd():
if torch._dynamo.compiled_autograd.nested_level > 0:
return noop
assert rebuild_ctx is not None
compiler_fn = rebuild_ctx()
ctx = torch._dynamo.compiled_autograd._enable(

View File

@ -2470,6 +2470,7 @@ def _clone_input(value, fake_mode):
)
or value.is_nested
):
breakpoint()
# NB: ensure strides are preserved
value = clone_input(value)
@ -3257,6 +3258,7 @@ def wrap_to_fake_tensor_and_record(
type(e),
)
breakpoint()
fake_e = wrap_fake_exception(
lambda: tx.fake_mode.from_tensor(
e,

View File

@ -138,14 +138,15 @@ def _autograd_grad(
diff_outputs, grad_outputs = zip(*result)
if len(diff_outputs) == 0:
return tuple(torch.zeros_like(inp) for inp in inputs)
grad_inputs = torch.autograd.grad(
diff_outputs,
inputs,
grad_outputs,
retain_graph=retain_graph,
create_graph=create_graph,
allow_unused=True,
)
with torch._dynamo.compiled_autograd._disable():
grad_inputs = torch.autograd.grad(
diff_outputs,
inputs,
grad_outputs,
retain_graph=retain_graph,
create_graph=create_graph,
allow_unused=True,
)
grad_inputs = tuple(
torch.zeros_like(inp) if gi is None else gi
for gi, inp in zip(grad_inputs, inputs)
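
The hunk above disables compiled autograd around the nested torch.autograd.grad call and then replaces None gradients (from allow_unused=True) with zeros. A small self-contained example of that None-to-zeros pattern:

    import torch

    x = torch.randn(3, requires_grad=True)
    unused = torch.randn(3, requires_grad=True)
    out = (2 * x).sum()

    grads = torch.autograd.grad(out, (x, unused), allow_unused=True)
    grads = tuple(
        torch.zeros_like(inp) if g is None else g
        for g, inp in zip(grads, (x, unused))
    )
    print(grads[1])  # zeros, since `unused` does not affect `out`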

View File

@ -218,7 +218,12 @@ class BaseHOPFunction(torch.autograd.Function):
kwargs = ctx.kwargs
# TODO: Something special needs to happen with min cut partitioner
with suspend_functionalization(), disable_functional_mode(), torch.enable_grad():
with (
suspend_functionalization(),
disable_functional_mode(),
torch.enable_grad(),
torch._dynamo.compiled_autograd._disable(),
):
with disable_proxy_modes_tracing():
from .invoke_subgraph import create_fw_bw_graph
from .utils import _from_fun

View File

@ -207,7 +207,7 @@ def trace_joint_graph(fn, fw_inputs, fw_outputs):
# TODO (@anijain2305) - Delete this function when base_hop uses invoke_subgraph infra
def create_fw_bw_graph(subgraph, operands, grad_outputs=None):
with suspend_functionalization(), disable_functional_mode():
with disable_proxy_modes_tracing():
with disable_proxy_modes_tracing(), torch._dynamo.compiled_autograd._disable():
# args are functional tensors, generate some example tensors
fw_inputs = pytree.tree_map(_from_fun, operands)

View File

@ -46,7 +46,7 @@ def create_fw_bw_graph(f, num_mapped_args, *args):
# See Note [HOP create fw_bw graph] in create_fw_bw_graph in utils.py
with suspend_functionalization(), disable_functional_mode():
with suspend_functionalization(), disable_functional_mode(), torch._dynamo.compiled_autograd._disable():
with disable_proxy_modes_tracing():
unwrapped_mapped_xs = pytree.tree_map(_from_fun, mapped_xs)
example_xs = _unstack_pytree(unwrapped_mapped_xs)[0]

View File

@ -767,9 +767,14 @@ def _check_analytical_jacobian_attributes(
diff_input_list = list(_iter_tensors(inputs, True))
def vjp_fn(grad_output):
return torch.autograd.grad(
output, diff_input_list, grad_output, retain_graph=True, allow_unused=True
)
with torch._dynamo.compiled_autograd._disable():
return torch.autograd.grad(
output,
diff_input_list,
grad_output,
retain_graph=True,
allow_unused=True,
)
# Compute everything twice to check for nondeterminism (which we call reentrancy)
if fast_mode:
@ -1969,6 +1974,13 @@ def gradcheck(
fast_mode: bool = False,
masked: Optional[bool] = None,
) -> bool: # noqa: D400,D205
if torch._dynamo.config.compiled_autograd:
if not check_backward_ad:
return True
check_batched_grad = False
check_batched_forward_grad = False
check_forward_ad = False
check_undefined_grad = False
r"""Check gradients computed via small finite differences against analytical
gradients wrt tensors in :attr:`inputs` that are of floating point or complex type
and with ``requires_grad=True``.
@ -2135,6 +2147,10 @@ def gradgradcheck(
fast_mode: bool = False,
masked: bool = False,
) -> bool: # noqa: D400,D205
if torch._dynamo.config.compiled_autograd:
check_undefined_grad = False
check_batched_grad = False
r"""Check gradients of gradients computed via small finite differences
against analytical gradients wrt tensors in :attr:`inputs` and
:attr:`grad_outputs` that are of floating point or complex type and with
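
The early-out added to gradcheck above downgrades it to a plain backward-mode check when compiled autograd is on. Invoking gradcheck with that same reduced set of checks looks roughly like this (standard torch.autograd.gradcheck keyword arguments):

    import torch
    from torch.autograd import gradcheck

    x = torch.randn(4, dtype=torch.double, requires_grad=True)
    ok = gradcheck(
        lambda t: (t * t).sum(),
        (x,),
        check_batched_grad=False,
        check_batched_forward_grad=False,
        check_forward_ad=False,
        check_undefined_grad=False,
    )
    print(ok)  # True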

View File

@ -226,6 +226,7 @@ struct TORCH_API Node : std::enable_shared_from_this<Node> {
}
const InputMetadata& input_metadata(size_t index) const {
std::cout << "index=" << index << ", size=" << input_metadata_.size() << std::endl;
return input_metadata_[index];
}

View File

@ -97,8 +97,9 @@ struct TORCH_API InputMetadata {
// Danger: not thread safe, caller must protect with lock
SymIntSmallVec& mutable_shape_as_dim_vector();
private:
at::Tensor shape_as_tensor() const;
private:
bool is_nestedness_same(const at::Tensor& grad) const;
bool maybe_expandable_to(const at::Tensor& grad) const;

View File

@ -564,17 +564,34 @@ class CompiledNodeArgs {
collect_hooks_from(t.node.get());
}
void collect(const Edge& t) {
std::cout << "collecting edge start" << std::endl;
if (cond(t.is_valid())) {
collect_size(_compiler.node_calls.lookup(t.function).id);
collect_size(t.input_nr);
std::cout << "collecting edge input metadata" << std::endl;
collect(t.function->input_metadata(t.input_nr)); // for validate_outputs
}
std::cout << "collecting edge done" << std::endl;
}
void collect(const InputMetadata& t) {
TORCH_CHECK(!t.is_nested_tensor(), "NestedTensor not implemented");
// TORCH_CHECK(!t.is_nested_tensor(), "NestedTensor not implemented");
std::cout << "collect options" << std::endl;
collect(t.options());
std::cout << "collect subclass" << std::endl;
collect(t.is_tensor_subclass());
collect(t.shape_as_dim_vector());
std::cout << "collect shape" << std::endl;
// Need to collect:
// 1. nestedness
// 2. the shapes, to pass in and reconstruct, or fall back to dynamic shapes.
// Nested tensors store their shape as a tensor, and its values matter.
// We treat the shape as dynamic by simply not collecting it here,
// which should be safe.
if (t.is_nested_tensor()) {
// t.shape_as_tensor();
} else {
collect(t.shape_as_dim_vector());
}
}
void collect(const VariableInfo& t) {
collect(t.layout);
@ -1448,10 +1465,15 @@ struct IValuePacker<at::TensorGeometry> {
template <>
struct IValuePacker<InputMetadata> {
static at::IValue pack(const InputMetadata& t) {
TORCH_INTERNAL_ASSERT(!t.is_nested_tensor());
std::vector<c10::SymInt> input_shape;
if (!t.is_nested_tensor()) {
input_shape = t.shape_as_dim_vector().vec();
}
auto tuple = std::make_tuple(
pack_TensorOptions(t.options()),
t.shape_as_dim_vector().vec(),
input_shape,
t.is_tensor_subclass());
return tuple;
}
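
The collection change above skips shape_as_dim_vector() for nested tensors because a nested tensor has no single dense shape; its per-component sizes live in a separate tensor. A short illustration of that raggedness using the torch.nested API (separate from the C++ code itself):

    import torch

    # Two components with different first dimensions: there is no single
    # static shape for the cache key to record, hence "treat as dynamic".
    nt = torch.nested.nested_tensor([torch.randn(2, 3), torch.randn(4, 3)])
    print(nt.is_nested)                    # True
    print([t.shape for t in nt.unbind()])  # [torch.Size([2, 3]), torch.Size([4, 3])]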

View File

@ -1,5 +1,6 @@
#include <torch/csrc/dynamo/python_compiled_autograd.h>
#include <ATen/LegacyVmapMode.h>
#include <torch/csrc/autograd/engine.h>
#include <torch/csrc/autograd/functions/accumulate_grad.h>
#include <torch/csrc/autograd/python_function.h>
@ -918,7 +919,9 @@ static CacheNode* _compiled_autograd_impl(
}
node_args.collect(call);
if (node_args.cond(call.needed)) {
std::cout << "compiled_args on " << fn->name() << std::endl;
fn->compiled_args(node_args);
std::cout << "next edges on " << fn->name() << std::endl;
node_args.collect(call.node->next_edges());
}
CacheKey key = node_args.key();
@ -936,6 +939,7 @@ static CacheNode* _compiled_autograd_impl(
cache = cache->lookup(key);
}
std::cout << "collecting edges on " << fn->name() << std::endl;
for (const auto& edge : fn->next_edges()) {
if (!edge.is_valid()) {
continue;
@ -1191,6 +1195,9 @@ static variable_list compiled_autograd(
TORCH_CHECK(
c10::impl::TorchDispatchModeTLS::stack_len() == 0,
"TorchDispatchMode not yet implemented for compiled autograd")
TORCH_CHECK(
at::impl::VmapMode::current_vmap_level() == 0,
"torch.vmap not yet implemented for compiled autograd")
static std::mutex mtx;
LockGuardWithErrorLogs lock_guard(mtx);
pybind11::gil_scoped_acquire gil;

View File

@ -2271,6 +2271,7 @@ def make_fx(
record_module_stack: bool = False,
_allow_fake_constant: bool = False,
_error_on_data_dependent_ops: bool = True,
_disable_compiled_autograd: bool = True,
) -> Callable[..., GraphModule]:
"""
Given a function f, return a new function which when executed with valid
@ -2290,9 +2291,18 @@ def make_fx(
_error_on_data_dependent_ops,
)
@functools.wraps(f)
def wrapped(*args: object) -> GraphModule:
return make_fx_tracer.trace(f, *args)
if _disable_compiled_autograd:
@functools.wraps(f)
def wrapped(*args: object) -> GraphModule:
with torch._dynamo.compiled_autograd._disable():
return make_fx_tracer.trace(f, *args)
else:
@functools.wraps(f)
def wrapped(*args: object) -> GraphModule:
return make_fx_tracer.trace(f, *args)
return wrapped
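
With the new _disable_compiled_autograd flag defaulting to True, make_fx traces with compiled autograd off unless a caller opts out. Typical usage is unchanged; a hedged example:

    import torch
    from torch.fx.experimental.proxy_tensor import make_fx

    def f(x):
        return x.sin() + x.cos()

    # Compiled autograd is disabled for the duration of tracing by default.
    gm = make_fx(f)(torch.randn(3))
    print(gm.graph)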

View File

@ -11,6 +11,7 @@ import argparse
import contextlib
import copy
import ctypes
from pathlib import PosixPath
import errno
import functools
import gc
@ -1555,6 +1556,10 @@ TEST_WITH_TORCHDYNAMO: bool = TestEnvironment.def_flag(
env_var="PYTORCH_TEST_WITH_DYNAMO",
implied_by_fn=lambda: TEST_WITH_TORCHINDUCTOR or TEST_WITH_AOT_EAGER,
)
TEST_DISABLE_CA: bool = TestEnvironment.def_flag(
"TEST_DISABLE_CA",
env_var="PYTORCH_DISABLE_CA",
)
if TEST_WITH_TORCHDYNAMO:
import torch._dynamo
@ -1567,6 +1572,8 @@ if TEST_WITH_TORCHDYNAMO:
if TEST_WITH_TORCHINDUCTOR:
import torch._inductor.config
torch._inductor.config.fallback_random = True
else:
torch._dynamo.config.compiled_autograd = not TEST_DISABLE_CA
# seems like this is only used in test/torch_np
@ -3308,11 +3315,25 @@ class TestCase(expecttest.TestCase):
assert result.wasSuccessful() is False
result.stop()
def skip_test(self, result):
bad_paths = ["test_jit.py", "test_jit_fuser_te.py"]
if path := getattr(result, "path", None):
if isinstance(path, PosixPath):
path_str = path.as_posix()
print(f"skipping CA test {path_str=}")
for bad_path in bad_paths:
if bad_path in path_str:
return True
return False
def run(self, result=None):
with contextlib.ExitStack() as stack:
if TEST_WITH_CROSSREF:
stack.enter_context(CrossRefMode())
if torch._dynamo.config.compiled_autograd and self.skip_test(result):
return
self._run_custom(
result=result,
)
@ -5311,6 +5332,9 @@ class TestGradients(TestCase):
def _check_helper(self, device, dtype, op, variant, check, *, check_forward_ad=False, check_backward_ad=True,
check_batched_grad=None, check_batched_forward_grad=False):
if torch._dynamo.config.compiled_autograd:
check_batched_grad = False
check_batched_forward_grad = False
assert check in ('gradcheck', 'bwgrad_bwgrad', 'fwgrad_bwgrad')
# NB: check_backward_ad does not affect gradgradcheck (always True)
if variant is None:
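
The common_utils changes default compiled autograd on for dynamo-wrapped test runs unless PYTORCH_DISABLE_CA is set. A hedged sketch of the intended wiring (TestEnvironment.def_flag internals are elided; the env-var parsing below is an assumption):

    import os
    import torch
    import torch._dynamo

    TEST_DISABLE_CA = os.getenv("PYTORCH_DISABLE_CA", "0") == "1"

    # Mirror of the else-branch above: when not running under inductor,
    # turn compiled autograd on unless the kill switch is set.
    torch._dynamo.config.compiled_autograd = not TEST_DISABLE_CA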

View File

@ -316,7 +316,8 @@ class CheckpointFunction(torch.autograd.Function):
"none of output has requires_grad=True,"
" this checkpoint() is not necessary"
)
torch.autograd.backward(outputs_with_grad, args_with_grad)
with torch._dynamo.compiled_autograd._disable():
torch.autograd.backward(outputs_with_grad, args_with_grad)
grads = tuple(
inp.grad if isinstance(inp, torch.Tensor) else None
for inp in detached_inputs