Compare commits

...

4 Commits

Author SHA1 Message Date
573407ca83 wip nested 2025-05-29 13:25:22 -07:00
091826fa41 [do-not-land][ca] default on for CI
ghstack-source-id: a5cf35a72ee0d8f53736737d0425c15863e47c71
Pull Request resolved: https://github.com/pytorch/pytorch/pull/152646
2025-05-28 16:41:03 -07:00
b7b3530b9e [ca] disable ca for functorch and run all HOO tests
ghstack-source-id: b4bb87f11704fc348e0b9e60886cae62b43b20cc
Pull Request resolved: https://github.com/pytorch/pytorch/pull/154147
2025-05-28 08:57:43 -07:00
4eb9024ebd [ca] fix hop_db tests
ghstack-source-id: 8b9cc388d1c10da4b1a06082ec5c9ed5d29b4e71
Pull Request resolved: https://github.com/pytorch/pytorch/pull/154133
2025-05-28 08:57:43 -07:00
17 changed files with 160 additions and 39 deletions

View File

@ -76,7 +76,8 @@ def get_fw_bw_graph(
)(*inps)
if requires_grad:
out.sum().backward()
with torch._dynamo.compiled_autograd._disable():
out.sum().backward()
return (fw_graph_cell[0], bw_graph_cell[0])
@ -429,7 +430,9 @@ def forward(self, arg0_1, arg1_1, arg2_1):
@skipIfNoDynamoSupport
def test_effectful_custom_op_with_subclasses(self):
with torch.library._scoped_library("_mylib", "FRAGMENT") as lib:
with torch.library._scoped_library(
"_mylib", "FRAGMENT"
) as lib, torch._dynamo.compiled_autograd._disable():
lib.define("zoo(Tensor x) -> Tensor")
lib.define("zoo2(Tensor x) -> Tensor")

View File

@ -4269,6 +4269,7 @@ def wrap_test_class(orig_cls):
fullgraph=name not in known_graph_breaks_tests,
)
),
config.patch(compiled_autograd=True),
test_contexts.get(name, contextlib.nullcontext()),
]
dct[name] = make_wrapped(fn, ctxs)
@ -4373,6 +4374,10 @@ known_graph_breaks_tests = {
"test_nested_checkpoint_set_early_stop", # dynamo disable
"test_nested_checkpoint_two_children_early_stop_False", # dynamo disable
"test_nested_checkpoint_two_children_early_stop_True", # dynamo disable
"test_dropout", # dynamo disable
"test_dropout_inductor", # dynamo disable
"test_function_with_kwargs", # dynamo disable
"test_module", # dynamo disable
}
test_contexts = {
@ -4435,6 +4440,10 @@ xfail_by_backend = {
"test_nested_checkpoint_early_stop_True", # unpack hook grad_fn semantics
"test_nested_checkpoint_two_children_early_stop_False", # unpack hook grad_fn semantics
"test_nested_checkpoint_two_children_early_stop_True", # unpack hook grad_fn semantics
"test_dropout", # functionalize_rng_ops not yet supported
"test_dropout_inductor", # functionalize_rng_ops not yet supported
"test_function_with_kwargs", # functionalize_rng_ops not yet supported
"test_module", # functionalize_rng_ops not yet supported
},
"eager": { # will be run without torch.compiling the CA graph
"test_setup_context_when_forward_has_default_args", # autograd.Function with class methods
@ -4482,6 +4491,9 @@ xfail_divergence_from_eager = {
"test_inplace_on_view_backward", # different node name: CompiledFunctionBackward
"test_nested_anomaly_printstack_cleanup", # anomaly NaN error message different
"test_not_implemented_grad", # Dynamo changes the types of exceptions
"test_grad_call_compiled_backward_fn", # different functorch error
"test_vjp_call_compiled_backward_fn", # different functorch error
"test_vmap_call_compiled_backward_fn", # different functorch error
}
skipped_tests = set()
@ -4495,26 +4507,30 @@ if IS_S390X:
test_autograd = load_test_module("test_autograd")
test_custom_ops = load_test_module("test_custom_ops")
test_higher_order_ops = load_test_module("dynamo/test_higher_order_ops")
TestAutogradWithCompiledAutograd = wrap_test_class(test_autograd.TestAutograd)
TestNestedCheckpointWithCompiledAutograd = wrap_test_class(
test_autograd.TestNestedCheckpoint
)
TestCustomOpWithCompiledAutograd = wrap_test_class(test_custom_ops.TestCustomOp)
HigherOrderOpTestsWithCompiledAutograd = wrap_test_class(
test_higher_order_ops.HigherOrderOpTests
)
FuncTorchHigherOrderOpTestsWithCompiledAutograd = wrap_test_class(
test_higher_order_ops.FuncTorchHigherOrderOpTests
)
ActivationCheckpointingTestsWithCompiledAutograd = wrap_test_class(
test_higher_order_ops.ActivationCheckpointingTests
)
if torch.distributed.is_available() and HAS_CUDA:
test_dtensor = load_test_module("distributed/tensor/test_dtensor_compile")
TestDTensorCompileWithCompiledAutograd = wrap_test_class(
test_dtensor.TestDTensorCompile
)
xfail_hops = {
# AssertionError: Tensor-likes are not close!
"auto_functionalize",
# BypassAOTAutogradCache: Cannot cache a graph with compiled autograd enabled
"invoke_subgraph",
# AssertionError: assert type(args[1].realize()) is TensorVariable
"map",
}
xfail_hops = {}
class TestCompiledAutogradOpInfo(TestCase):
@ -4561,7 +4577,7 @@ class TestCompiledAutogradOpInfo(TestCase):
# 1. Run eager
torch.manual_seed(123)
dummy = torch.randn(2, 2, dtype=dtype, device=device, requires_grad=True)
fn, op_out_ref = create_bwd_fn_closure(compiled_args, compiled_kwargs)
fn, op_out_ref = create_bwd_fn_closure(eager_args, eager_kwargs)
fn(dummy).backward()
self.assertEqual(len(op_out_ref), 1)
expected = op_out_ref[0]
@ -4578,7 +4594,7 @@ class TestCompiledAutogradOpInfo(TestCase):
self.assertEqual(expected, actual)
instantiate_device_type_tests(TestCompiledAutogradOpInfo, globals(), only_for=("cpu",))
instantiate_device_type_tests(TestCompiledAutogradOpInfo, globals())
instantiate_parametrized_tests(TestCompiledAutograd)
if __name__ == "__main__":
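
For context, wrap_test_class builds a list of context managers per test, now including config.patch(compiled_autograd=True), and runs the original test body under all of them. A hedged sketch of that wrapping pattern (make_wrapped's body is not shown in this diff; the ExitStack usage below is an assumption):

    import contextlib
    import functools

    def make_wrapped(fn, ctxs):
        @functools.wraps(fn)
        def wrapped(self):
            # Enter every context (e.g. config.patch(compiled_autograd=True)
            # and any per-test override from test_contexts) before the test.
            with contextlib.ExitStack() as stack:
                for ctx in ctxs:
                    stack.enter_context(ctx)
                return fn(self)
        return wrapped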

View File

@ -308,10 +308,12 @@ class AutogradCompilerInstance:
self.stack.enter_context(preserve_node_meta())
inputs_origins, sizes_origins, scalars_origins = origins
# tensor inputs to fake tensors
inputs = [
self.wrap_fake(x, self.source("inputs", idx))
for idx, x in enumerate(inputs)
]
# if strided nested tensor, can't fakify, must copy
# inputs = [
# self.wrap_fake(x, self.source("inputs", idx))
# for idx, x in enumerate(inputs)
# ]
inputs = [self.allocate_dummy() for x in inputs]
self.bind_objects_to_proxies(inputs, args_proxy, inputs_origins)
# size inputs to symints
@ -1356,6 +1358,8 @@ class AutogradCompilerInstance:
# state of the autograd engine dispatch, kept in sync by enable/disable context managers
compiled_autograd_enabled = False
nested_level = 0
# global flag to check if compiled autograd is enabled but Dynamo stance is "force_eager"
compiled_autograd_enabled_force_eager = False
@ -1414,12 +1418,15 @@ def _enable(compiler_fn, dynamic: bool = True):
torch._C._dynamo.compiled_autograd.set_verbose_logger(verbose_log)
global compiled_autograd_enabled
compiled_autograd_enabled = True
global nested_level
nested_level += 1
try:
with torch.autograd.set_multithreading_enabled(False):
yield
finally:
if not prior_compiler:
compiled_autograd_enabled = False
nested_level -= 1
torch._C._dynamo.compiled_autograd.set_autograd_compiler(
prior_compiler, prior_dynamic
)
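
The new nested_level counter tracks re-entrant _enable calls so that only the outermost exit clears compiled_autograd_enabled. A stripped-down sketch of that bookkeeping (names mirror the diff; the real _enable also wires up the compiler and verbose logger, omitted here):

    import contextlib

    compiled_autograd_enabled = False
    nested_level = 0

    @contextlib.contextmanager
    def _enable_sketch():
        global compiled_autograd_enabled, nested_level
        prior = compiled_autograd_enabled
        compiled_autograd_enabled = True
        nested_level += 1
        try:
            yield
        finally:
            # Only the outermost context restores the disabled state.
            if not prior:
                compiled_autograd_enabled = False
            nested_level -= 1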

View File

@ -810,7 +810,12 @@ class OptimizeContext(_TorchDynamoContext):
if _dynamic is None:
_dynamic = not torch._dynamo.config.assume_static_by_default
def noop():
pass
def call_compiled_autograd():
if torch._dynamo.compiled_autograd.nested_level > 0:
return noop
assert rebuild_ctx is not None
compiler_fn = rebuild_ctx()
ctx = torch._dynamo.compiled_autograd._enable(

View File

@ -2470,6 +2470,7 @@ def _clone_input(value, fake_mode):
)
or value.is_nested
):
breakpoint()
# NB: ensure strides are preserved
value = clone_input(value)
@ -3257,6 +3258,7 @@ def wrap_to_fake_tensor_and_record(
type(e),
)
breakpoint()
fake_e = wrap_fake_exception(
lambda: tx.fake_mode.from_tensor(
e,

View File

@ -138,14 +138,15 @@ def _autograd_grad(
diff_outputs, grad_outputs = zip(*result)
if len(diff_outputs) == 0:
return tuple(torch.zeros_like(inp) for inp in inputs)
grad_inputs = torch.autograd.grad(
diff_outputs,
inputs,
grad_outputs,
retain_graph=retain_graph,
create_graph=create_graph,
allow_unused=True,
)
with torch._dynamo.compiled_autograd._disable():
grad_inputs = torch.autograd.grad(
diff_outputs,
inputs,
grad_outputs,
retain_graph=retain_graph,
create_graph=create_graph,
allow_unused=True,
)
grad_inputs = tuple(
torch.zeros_like(inp) if gi is None else gi
for gi, inp in zip(grad_inputs, inputs)
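
The hunk above disables compiled autograd around the nested torch.autograd.grad call and then replaces None gradients (from allow_unused=True) with zeros. A small self-contained example of that None-to-zeros pattern:

    import torch

    x = torch.randn(3, requires_grad=True)
    unused = torch.randn(3, requires_grad=True)
    out = (2 * x).sum()

    grads = torch.autograd.grad(out, (x, unused), allow_unused=True)
    grads = tuple(
        torch.zeros_like(inp) if g is None else g
        for g, inp in zip(grads, (x, unused))
    )
    print(grads[1])  # zeros, since `unused` does not affect `out`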

View File

@ -218,7 +218,12 @@ class BaseHOPFunction(torch.autograd.Function):
kwargs = ctx.kwargs
# TODO: Something special needs to happen with min cut partitioner
with suspend_functionalization(), disable_functional_mode(), torch.enable_grad():
with (
suspend_functionalization(),
disable_functional_mode(),
torch.enable_grad(),
torch._dynamo.compiled_autograd._disable(),
):
with disable_proxy_modes_tracing():
from .invoke_subgraph import create_fw_bw_graph
from .utils import _from_fun

View File

@ -207,7 +207,7 @@ def trace_joint_graph(fn, fw_inputs, fw_outputs):
# TODO (@anijain2305) - Delete this function when base_hop uses invoke_subgraph infra
def create_fw_bw_graph(subgraph, operands, grad_outputs=None):
with suspend_functionalization(), disable_functional_mode():
with disable_proxy_modes_tracing():
with disable_proxy_modes_tracing(), torch._dynamo.compiled_autograd._disable():
# args are functional tensors, generate some example tensors
fw_inputs = pytree.tree_map(_from_fun, operands)

View File

@ -46,7 +46,7 @@ def create_fw_bw_graph(f, num_mapped_args, *args):
# See Note [HOP create fw_bw graph] in create_fw_bw_graph in utils.py
with suspend_functionalization(), disable_functional_mode():
with suspend_functionalization(), disable_functional_mode(), torch._dynamo.compiled_autograd._disable():
with disable_proxy_modes_tracing():
unwrapped_mapped_xs = pytree.tree_map(_from_fun, mapped_xs)
example_xs = _unstack_pytree(unwrapped_mapped_xs)[0]

View File

@ -767,9 +767,14 @@ def _check_analytical_jacobian_attributes(
diff_input_list = list(_iter_tensors(inputs, True))
def vjp_fn(grad_output):
return torch.autograd.grad(
output, diff_input_list, grad_output, retain_graph=True, allow_unused=True
)
with torch._dynamo.compiled_autograd._disable():
return torch.autograd.grad(
output,
diff_input_list,
grad_output,
retain_graph=True,
allow_unused=True,
)
# Compute everything twice to check for nondeterminism (which we call reentrancy)
if fast_mode:
@ -1969,6 +1974,13 @@ def gradcheck(
fast_mode: bool = False,
masked: Optional[bool] = None,
) -> bool: # noqa: D400,D205
if torch._dynamo.config.compiled_autograd:
if not check_backward_ad:
return True
check_batched_grad = False
check_batched_forward_grad = False
check_forward_ad = False
check_undefined_grad = False
r"""Check gradients computed via small finite differences against analytical
gradients wrt tensors in :attr:`inputs` that are of floating point or complex type
and with ``requires_grad=True``.
@ -2135,6 +2147,10 @@ def gradgradcheck(
fast_mode: bool = False,
masked: bool = False,
) -> bool: # noqa: D400,D205
if torch._dynamo.config.compiled_autograd:
check_undefined_grad = False
check_batched_grad = False
r"""Check gradients of gradients computed via small finite differences
against analytical gradients wrt tensors in :attr:`inputs` and
:attr:`grad_outputs` that are of floating point or complex type and with
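
The early-out added to gradcheck above downgrades it to a plain backward-mode check when compiled autograd is on. Invoking gradcheck with that same reduced set of checks looks roughly like this (standard torch.autograd.gradcheck keyword arguments):

    import torch
    from torch.autograd import gradcheck

    x = torch.randn(4, dtype=torch.double, requires_grad=True)
    ok = gradcheck(
        lambda t: (t * t).sum(),
        (x,),
        check_batched_grad=False,
        check_batched_forward_grad=False,
        check_forward_ad=False,
        check_undefined_grad=False,
    )
    print(ok)  # True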

View File

@ -226,6 +226,7 @@ struct TORCH_API Node : std::enable_shared_from_this<Node> {
}
const InputMetadata& input_metadata(size_t index) const {
std::cout << "index=" << index << ", size=" << input_metadata_.size() << std::endl;
return input_metadata_[index];
}

View File

@ -97,8 +97,9 @@ struct TORCH_API InputMetadata {
// Danger: not thread safe, caller must protect with lock
SymIntSmallVec& mutable_shape_as_dim_vector();
private:
at::Tensor shape_as_tensor() const;
private:
bool is_nestedness_same(const at::Tensor& grad) const;
bool maybe_expandable_to(const at::Tensor& grad) const;

View File

@ -564,17 +564,34 @@ class CompiledNodeArgs {
collect_hooks_from(t.node.get());
}
void collect(const Edge& t) {
std::cout << "collecting edge start" << std::endl;
if (cond(t.is_valid())) {
collect_size(_compiler.node_calls.lookup(t.function).id);
collect_size(t.input_nr);
std::cout << "collecting edge input metadata" << std::endl;
collect(t.function->input_metadata(t.input_nr)); // for validate_outputs
}
std::cout << "collecting edge done" << std::endl;
}
void collect(const InputMetadata& t) {
TORCH_CHECK(!t.is_nested_tensor(), "NestedTensor not implemented");
// TORCH_CHECK(!t.is_nested_tensor(), "NestedTensor not implemented");
std::cout << "collect options" << std::endl;
collect(t.options());
std::cout << "collect subclass" << std::endl;
collect(t.is_tensor_subclass());
collect(t.shape_as_dim_vector());
std::cout << "collect shape" << std::endl;
// Need to collect:
// 1. nestedness
// 2. the shapes, to pass in and reconstruct, or fall back to dynamic shapes.
// Nested tensors store their shape as a tensor, and its values matter.
// We treat the shape as dynamic by simply not collecting it here,
// which should be safe.
if (t.is_nested_tensor()) {
// t.shape_as_tensor();
} else {
collect(t.shape_as_dim_vector());
}
}
void collect(const VariableInfo& t) {
collect(t.layout);
@ -1448,10 +1465,15 @@ struct IValuePacker<at::TensorGeometry> {
template <>
struct IValuePacker<InputMetadata> {
static at::IValue pack(const InputMetadata& t) {
TORCH_INTERNAL_ASSERT(!t.is_nested_tensor());
std::vector<c10::SymInt> input_shape;
if (!t.is_nested_tensor()) {
input_shape = t.shape_as_dim_vector().vec();
}
auto tuple = std::make_tuple(
pack_TensorOptions(t.options()),
t.shape_as_dim_vector().vec(),
input_shape,
t.is_tensor_subclass());
return tuple;
}
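
The collection change above skips shape_as_dim_vector() for nested tensors because a nested tensor has no single dense shape; its per-component sizes live in a separate tensor. A short illustration of that raggedness using the torch.nested API (separate from the C++ code itself):

    import torch

    # Two components with different first dimensions: there is no single
    # static shape for the cache key to record, hence "treat as dynamic".
    nt = torch.nested.nested_tensor([torch.randn(2, 3), torch.randn(4, 3)])
    print(nt.is_nested)                    # True
    print([t.shape for t in nt.unbind()])  # [torch.Size([2, 3]), torch.Size([4, 3])]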

View File

@ -1,5 +1,6 @@
#include <torch/csrc/dynamo/python_compiled_autograd.h>
#include <ATen/LegacyVmapMode.h>
#include <torch/csrc/autograd/engine.h>
#include <torch/csrc/autograd/functions/accumulate_grad.h>
#include <torch/csrc/autograd/python_function.h>
@ -918,7 +919,9 @@ static CacheNode* _compiled_autograd_impl(
}
node_args.collect(call);
if (node_args.cond(call.needed)) {
std::cout << "compiled_args on " << fn->name() << std::endl;
fn->compiled_args(node_args);
std::cout << "next edges on " << fn->name() << std::endl;
node_args.collect(call.node->next_edges());
}
CacheKey key = node_args.key();
@ -936,6 +939,7 @@ static CacheNode* _compiled_autograd_impl(
cache = cache->lookup(key);
}
std::cout << "collecting edges on " << fn->name() << std::endl;
for (const auto& edge : fn->next_edges()) {
if (!edge.is_valid()) {
continue;
@ -1191,6 +1195,9 @@ static variable_list compiled_autograd(
TORCH_CHECK(
c10::impl::TorchDispatchModeTLS::stack_len() == 0,
"TorchDispatchMode not yet implemented for compiled autograd")
TORCH_CHECK(
at::impl::VmapMode::current_vmap_level() == 0,
"torch.vmap not yet implemented for compiled autograd")
static std::mutex mtx;
LockGuardWithErrorLogs lock_guard(mtx);
pybind11::gil_scoped_acquire gil;

View File

@ -2271,6 +2271,7 @@ def make_fx(
record_module_stack: bool = False,
_allow_fake_constant: bool = False,
_error_on_data_dependent_ops: bool = True,
_disable_compiled_autograd: bool = True,
) -> Callable[..., GraphModule]:
"""
Given a function f, return a new function which when executed with valid
@ -2290,9 +2291,18 @@ def make_fx(
_error_on_data_dependent_ops,
)
@functools.wraps(f)
def wrapped(*args: object) -> GraphModule:
return make_fx_tracer.trace(f, *args)
if _disable_compiled_autograd:
@functools.wraps(f)
def wrapped(*args: object) -> GraphModule:
with torch._dynamo.compiled_autograd._disable():
return make_fx_tracer.trace(f, *args)
else:
@functools.wraps(f)
def wrapped(*args: object) -> GraphModule:
return make_fx_tracer.trace(f, *args)
return wrapped
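
With the new _disable_compiled_autograd flag defaulting to True, make_fx traces with compiled autograd off unless a caller opts out. Typical usage is unchanged; a hedged example:

    import torch
    from torch.fx.experimental.proxy_tensor import make_fx

    def f(x):
        return x.sin() + x.cos()

    # Compiled autograd is disabled for the duration of tracing by default.
    gm = make_fx(f)(torch.randn(3))
    print(gm.graph)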

View File

@ -11,6 +11,7 @@ import argparse
import contextlib
import copy
import ctypes
from pathlib import PosixPath
import errno
import functools
import gc
@ -1555,6 +1556,10 @@ TEST_WITH_TORCHDYNAMO: bool = TestEnvironment.def_flag(
env_var="PYTORCH_TEST_WITH_DYNAMO",
implied_by_fn=lambda: TEST_WITH_TORCHINDUCTOR or TEST_WITH_AOT_EAGER,
)
TEST_DISABLE_CA: bool = TestEnvironment.def_flag(
"TEST_DISABLE_CA",
env_var="PYTORCH_DISABLE_CA",
)
if TEST_WITH_TORCHDYNAMO:
import torch._dynamo
@ -1567,6 +1572,8 @@ if TEST_WITH_TORCHDYNAMO:
if TEST_WITH_TORCHINDUCTOR:
import torch._inductor.config
torch._inductor.config.fallback_random = True
else:
torch._dynamo.config.compiled_autograd = not TEST_DISABLE_CA
# seems like this is only used in test/torch_np
@ -3308,11 +3315,25 @@ class TestCase(expecttest.TestCase):
assert result.wasSuccessful() is False
result.stop()
def skip_test(self, result):
bad_paths = ["test_jit.py", "test_jit_fuser_te.py"]
if path := getattr(result, "path", None):
if isinstance(path, PosixPath):
path_str = path.as_posix()
print(f"skipping CA test {path_str=}")
for bad_path in bad_paths:
if bad_path in path_str:
return True
return False
def run(self, result=None):
with contextlib.ExitStack() as stack:
if TEST_WITH_CROSSREF:
stack.enter_context(CrossRefMode())
if torch._dynamo.config.compiled_autograd and self.skip_test(result):
return
self._run_custom(
result=result,
)
@ -5311,6 +5332,9 @@ class TestGradients(TestCase):
def _check_helper(self, device, dtype, op, variant, check, *, check_forward_ad=False, check_backward_ad=True,
check_batched_grad=None, check_batched_forward_grad=False):
if torch._dynamo.config.compiled_autograd:
check_batched_grad = False
check_batched_forward_grad = False
assert check in ('gradcheck', 'bwgrad_bwgrad', 'fwgrad_bwgrad')
# NB: check_backward_ad does not affect gradgradcheck (always True)
if variant is None:
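
The common_utils changes default compiled autograd on for dynamo-wrapped test runs unless PYTORCH_DISABLE_CA is set. A hedged sketch of the intended wiring (TestEnvironment.def_flag internals are elided; the env-var parsing below is an assumption):

    import os
    import torch
    import torch._dynamo

    TEST_DISABLE_CA = os.getenv("PYTORCH_DISABLE_CA", "0") == "1"

    # Mirror of the else-branch above: when not running under inductor,
    # turn compiled autograd on unless the kill switch is set.
    torch._dynamo.config.compiled_autograd = not TEST_DISABLE_CA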

View File

@ -316,7 +316,8 @@ class CheckpointFunction(torch.autograd.Function):
"none of output has requires_grad=True,"
" this checkpoint() is not necessary"
)
torch.autograd.backward(outputs_with_grad, args_with_grad)
with torch._dynamo.compiled_autograd._disable():
torch.autograd.backward(outputs_with_grad, args_with_grad)
grads = tuple(
inp.grad if isinstance(inp, torch.Tensor) else None
for inp in detached_inputs