Revert "[CI] Add Compiled DDP and Compiled FSDP2 tests to test_inductor_distributed (#138178)"
This reverts commit 8cb91109061648497ca09d6f1f9b9e13a2f5557e. Reverted https://github.com/pytorch/pytorch/pull/138178 on behalf of https://github.com/yf225: since https://github.com/pytorch/pytorch/pull/138174 has been reverted, this change needs to be reverted as well ([comment](https://github.com/pytorch/pytorch/pull/138178#issuecomment-2422961292))
@@ -320,7 +320,6 @@ test_inductor_distributed() {
   python test/run_test.py -i distributed/test_c10d_functional_native.py --verbose
   python test/run_test.py -i distributed/_tensor/test_dtensor_compile.py --verbose
   python test/run_test.py -i distributed/tensor/parallel/test_micro_pipeline_tp.py --verbose
-  python test/run_test.py -i distributed/_composable/test_replicate_with_compiler.py --verbose
   python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_comm.py --verbose
   python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_multi_group --verbose
   python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_with_activation_checkpointing --verbose
@@ -332,7 +331,6 @@ test_inductor_distributed() {
   python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_mixed_precision.py -k test_compute_dtype --verbose
   python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_mixed_precision.py -k test_reduce_dtype --verbose
   python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_clip_grad_norm_.py -k test_clip_grad_norm_2d --verbose
-  python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_compile.py --verbose
   python test/run_test.py -i distributed/fsdp/test_fsdp_tp_integration.py -k test_fsdp_tp_integration --verbose

   # this runs on both single-gpu and multi-gpu instance. It should be smart about skipping tests that aren't supported
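The lines above are shell commands inside the CI's test_inductor_distributed function. As a purely illustrative sketch, the same run_test.py entry point can also be driven from Python; this assumes the working directory is a PyTorch checkout with the distributed test prerequisites available:

    import subprocess
    import sys

    # Run one of the suites that remains in test_inductor_distributed after the revert.
    # The command mirrors the shell lines in the hunk above.
    subprocess.run(
        [
            sys.executable,
            "test/run_test.py",
            "-i",
            "distributed/_composable/fsdp/test_fully_shard_comm.py",
            "--verbose",
        ],
        check=True,
    )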
@@ -613,8 +613,7 @@ val.shape: {[node.meta['val'].shape for node in aliased_graph_inputs]},
     @skipIfRocm
     @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
     def test_nested_fully_shard_backend_aot_eager(self):
-        # TODO: fix fwd_fullgraph=False case
-        for fwd_fullgraph in [True]:
+        for fwd_fullgraph in [True, False]:
             self._test_traceable_fsdp(
                 *self._create_nested_fully_shard_factory_fns(
                     fwd_fullgraph=fwd_fullgraph
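The effect of this hunk is that the nested fully_shard test sweeps the fwd_fullgraph=False case again instead of only the fullgraph case. A standalone sketch of the restored pattern, with run_case as a hypothetical stand-in for self._test_traceable_fsdp:

    def run_case(fwd_fullgraph: bool) -> None:
        # Hypothetical stand-in: the real test builds FSDP2 model factories and
        # compiles the forward with fullgraph=fwd_fullgraph.
        print(f"running traceable-FSDP case with fwd_fullgraph={fwd_fullgraph}")

    # After the revert, both values are exercised instead of only True.
    for fwd_fullgraph in [True, False]:
        run_case(fwd_fullgraph)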
@@ -626,8 +625,7 @@ val.shape: {[node.meta['val'].shape for node in aliased_graph_inputs]},
     @skipIfRocm
     @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
     def test_nested_fully_shard_backend_aot_eager_decomp_partition(self):
-        # TODO: fix fwd_fullgraph=False case
-        for fwd_fullgraph in [True]:
+        for fwd_fullgraph in [True, False]:
             self._test_traceable_fsdp(
                 *self._create_nested_fully_shard_factory_fns(
                     fwd_fullgraph=fwd_fullgraph
@@ -732,7 +730,6 @@ val.shape: {[node.meta['val'].shape for node in aliased_graph_inputs]},
         )
         file_check.run(bwd_code)

-    @unittest.skip("TODO: fix fwd_fullgraph=False case")
     @skipIfRocm
     @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
     def test_nested_fully_shard_backend_inductor_fullgraph_False(self):
@@ -813,9 +810,8 @@ val.shape: {[node.meta['val'].shape for node in aliased_graph_inputs]},
     @skipIfRocm
     @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
     def test_transformer_backend_aot_eager(self):
-        # TODO: fix fwd_fullgraph=False case
         for fwd_fullgraph, all_requires_grad in itertools.product(
-            [True], [True, False]
+            [True, False], [True, False]
         ):
             with self._maybe_add_graph_break_to_sdpa(
                 fwd_fullgraph
@@ -833,9 +829,8 @@ val.shape: {[node.meta['val'].shape for node in aliased_graph_inputs]},
     # TODO: native_dropout has worse accuracy after decomp, need to figure out why
     @torch._inductor.config.patch(fallback_random=True)
     def test_transformer_backend_aot_eager_decomp_partition(self):
-        # TODO: fix fwd_fullgraph=False case
         for fwd_fullgraph, all_requires_grad in itertools.product(
-            [True], [True, False]
+            [True, False], [True, False]
         ):
             with self._maybe_add_graph_break_to_sdpa(fwd_fullgraph):
                 self._test_traceable_fsdp(
|
||||
)
|
||||
file_check.run(bwd_code)
|
||||
|
||||
@unittest.skip("TODO: fix fwd_fullgraph=False case")
|
||||
@skipIfRocm
|
||||
@unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
|
||||
# TODO: native_dropout causes CUDA IMA error, need to figure out why
|
||||
|
@@ -385,9 +385,6 @@ class DDP_TP_Test(InductorTestCase):
     def tearDown(self):
         dist.destroy_process_group()

-    @unittest.skip(
-        "Temporarily disabled due to SymInt error: `unhashable type: non-nested SymInt`"
-    )
     @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
     @skipIfRocm
     def test_ddp_tp(self):
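This hunk drops the unconditional unittest.skip wrapper while keeping the environment-dependent guards. A self-contained sketch of the two decorator styles, with HAS_GPU as a hypothetical stand-in for the real capability check:

    import unittest

    HAS_GPU = False  # hypothetical stand-in for the real GPU/triton capability check

    class SkipPatternExample(unittest.TestCase):
        # unittest.skip disables a test unconditionally (the decorator removed above);
        # unittest.skipIf only skips when its condition evaluates to True.
        @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
        def test_ddp_tp_like(self):
            self.assertTrue(True)

    if __name__ == "__main__":
        unittest.main()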