Mirror of https://github.com/pytorch/pytorch.git (synced 2025-11-07 01:50:04 +08:00)
Revert "Add Triton CPU as an Inductor backend (#133408)"
This reverts commit e498b02b472e45cfd6b7a08db0d6c1babec655c5. Reverted https://github.com/pytorch/pytorch/pull/133408 on behalf of https://github.com/jeanschmidt because it broke internal signals; see D62737208 for more details ([comment](https://github.com/pytorch/pytorch/pull/133408#issuecomment-2353623816))
@@ -46,7 +46,7 @@ from torch.testing._internal.common_distributed import (
     skip_if_lt_x_gpu,
 )
 from torch.testing._internal.common_utils import requires_cuda
-from torch.testing._internal.inductor_utils import HAS_GPU
+from torch.utils._triton import has_triton
 
 
 def reset_rng_state():
@@ -325,7 +325,7 @@ def run_hf_bert_ddp(self, model, inputs, backend):
 
 
 class TestFakeDistributedSingleProc(torch._dynamo.test_case.TestCase):
-    @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
+    @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch")
     @patch.object(config, "optimize_ddp", True)
     @patch.object(torch._inductor.config, "fallback_random", True)
     def test_hf_bert_ddp_inductor(self):
@@ -528,7 +528,7 @@ class TestMultiProc(DynamoDistributedMultiProcTestCase):
 
     @skip_if_lt_x_gpu(2)
     @import_transformers_or_skip()
-    @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
+    @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch")
     @config.patch(optimize_ddp=True, enable_compiler_collectives=True)
     @patch.object(torch._inductor.config, "fallback_random", True)
     def test_hf_bert_ddp_inductor(self):
@@ -536,7 +536,7 @@ class TestMultiProc(DynamoDistributedMultiProcTestCase):
 
     @skip_if_lt_x_gpu(2)
     @import_transformers_or_skip()
-    @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
+    @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch")
     @config.patch(optimize_ddp=True, enable_compiler_collectives=True)
     @patch.object(torch._inductor.config, "fallback_random", True)
     def test_hf_bert_ddp_inductor_static_graph(self):
@@ -561,7 +561,7 @@ class TestMultiProc(DynamoDistributedMultiProcTestCase):
         self._test_hf_bert_aot_eager(static_graph=True)
 
     @skip_if_lt_x_gpu(2)
-    @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
+    @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch")
     @config.patch(optimize_ddp=False, enable_compiler_collectives=True)
     def test_ddp_activation_checkpointing(self):
         from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import (
@@ -676,7 +676,7 @@ class TestMultiProc(DynamoDistributedMultiProcTestCase):
 
     @config.patch(enable_compiler_collectives=True)
     @skip_if_lt_x_gpu(1)
-    @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
+    @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch")
     def test_fsdp_inductor(self):
         with _dynamo_dist_per_rank_init(self.rank, self.world_size):
             # Test with basic FSDP wrapping (outer wrap around whole model)
@@ -701,7 +701,7 @@ class TestMultiProc(DynamoDistributedMultiProcTestCase):
 
     @config.patch(enable_compiler_collectives=True)
     @skip_if_lt_x_gpu(1)
-    @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
+    @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch")
     def test_fsdp_activation_checkpointing(self):
         with _dynamo_dist_per_rank_init(self.rank, self.world_size):
             model, inputs = get_toy_model_for_activation_checkpointing(
@@ -722,7 +722,7 @@ class TestMultiProc(DynamoDistributedMultiProcTestCase):
             )
 
     @import_transformers_or_skip()
-    @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
+    @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch")
     # TODO(whc) Investigate why cudagraphs breaks inductor+fsdp for hf_bert
     @patch.object(torch._inductor.config.triton, "cudagraphs", False)
     @patch.object(torch._inductor.config, "fallback_random", True)
@@ -767,7 +767,7 @@ class TestMultiProc(DynamoDistributedMultiProcTestCase):
         self.assertTrue(same(correct_results, opt_results))
 
     @import_transformers_or_skip()
-    @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
+    @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch")
     # TODO(whc) Investigate why cudagraphs breaks inductor+fsdp for hf_bert
     @patch.object(torch._inductor.config.triton, "cudagraphs", False)
     @patch.object(torch._inductor.config, "fallback_random", True)
@@ -815,7 +815,7 @@ class TestMultiProc(DynamoDistributedMultiProcTestCase):
             )
         self.assertTrue(same(correct_results, opt_results))
 
-    @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
+    @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch")
     @config.patch(enable_compiler_collectives=True)
     def test_compiler_collectives_automatic_dynamic_tensor(self):
         with _dynamo_dist_per_rank_init(self.rank, self.world_size):
@@ -860,7 +860,7 @@ class TestMultiProc(DynamoDistributedMultiProcTestCase):
             for r in res[1:]:
                 self.assertEqual(res[0], r)
 
-    @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
+    @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch")
     @config.patch(enable_compiler_collectives=True)
     def test_compiler_collectives_automatic_dynamic_scalar(self):
         with _dynamo_dist_per_rank_init(self.rank, self.world_size):
@@ -888,7 +888,7 @@ class TestMultiProc(DynamoDistributedMultiProcTestCase):
             for r in res[1:]:
                 self.assertEqual(res[0], r)
 
-    @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
+    @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch")
     @config.patch(enable_compiler_collectives=True)
     def test_compiler_collectives_automatic_dynamic_speculation_divergence(self):
         with _dynamo_dist_per_rank_init(self.rank, self.world_size):
@@ -921,7 +921,7 @@ class TestMultiProc(DynamoDistributedMultiProcTestCase):
             for r in res[1:]:
                 self.assertEqual(res[0], r)
 
-    @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
+    @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch")
     @config.patch(enable_compiler_collectives=True)
     def test_compiler_collectives_graph_break_empty_graph_still_collective(self):
         with _dynamo_dist_per_rank_init(self.rank, self.world_size):
@@ -955,7 +955,7 @@ class TestMultiProc(DynamoDistributedMultiProcTestCase):
             for r in res[1:]:
                 self.assertEqual(res[0], r)
 
-    @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
+    @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch")
     @config.patch(enable_compiler_collectives=True)
     def test_compiler_collectives_dim_mismatch(self):
         with _dynamo_dist_per_rank_init(self.rank, self.world_size):
@@ -984,7 +984,7 @@ class TestMultiProc(DynamoDistributedMultiProcTestCase):
             for r in res[1:]:
                 self.assertEqual(res[0], r)
 
-    @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
+    @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch")
     @config.patch(enable_compiler_collectives=True)
     def test_compiler_collectives_missing_source(self):
         with _dynamo_dist_per_rank_init(self.rank, self.world_size):
@@ -1006,7 +1006,7 @@ class TestMultiProc(DynamoDistributedMultiProcTestCase):
             for r in res[1:]:
                 self.assertEqual(res[0], r)
 
-    @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
+    @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch")
     @config.patch(enable_compiler_collectives=True)
     def test_compiler_collectives_scalar_missing_source(self):
         with _dynamo_dist_per_rank_init(self.rank, self.world_size):
@@ -1028,7 +1028,7 @@ class TestMultiProc(DynamoDistributedMultiProcTestCase):
            for r in res[1:]:
                 self.assertEqual(res[0], r)
 
-    @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
+    @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch")
     @config.patch(enable_compiler_collectives=True)
     def test_compiler_collectives_type_mismatch(self):
         with _dynamo_dist_per_rank_init(self.rank, self.world_size):
@@ -1062,7 +1062,7 @@ class TestMultiProc(DynamoDistributedMultiProcTestCase):
            for r in res[1:]:
                 self.assertEqual(res[0], r)
 
-    @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
+    @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch")
     @patch.object(torch._inductor.config, "fx_graph_cache", False)
     @patch.object(torch._inductor.config, "fx_graph_remote_cache", False)
     def test_asymmetric_compilation(self):
@@ -1113,7 +1113,7 @@ class TestMultiProc(DynamoDistributedMultiProcTestCase):
            for r in res[1:]:
                 self.assertEqual(res[0], r)
 
-    @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
+    @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch")
     @patch.object(torch._inductor.config, "fx_graph_cache", True)
     @patch.object(torch._inductor.config, "fx_graph_remote_cache", False)
     @patch.object(torch._inductor.config, "sleep_sec_TESTING_ONLY", 10)
@@ -1203,7 +1203,7 @@ class TestSingleProc(DynamoDistributedSingleProcTestCase):
         outputs = ddp_m(inputs)
         self.assertTrue(same(correct_outputs, outputs))
 
-    @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
+    @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch")
     @patch.object(config, "optimize_ddp", False)
     def test_ddp_baseline_inductor(self):
         from torch.nn.parallel import DistributedDataParallel as DDP
@@ -1299,7 +1299,7 @@ class TestSingleProc(DynamoDistributedSingleProcTestCase):
         self.assertTrue(all("DDPOptimizer" in r.reason for r in break_reasons))
 
     @patch.object(config, "optimize_ddp", True)
-    @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
+    @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch")
     def test_graph_split_inductor(self):
         assert config.optimize_ddp
         """
@@ -1368,18 +1368,18 @@ class TestSingleProc(DynamoDistributedSingleProcTestCase):
         opt_outputs = opt_fn(inputs)
         self.assertTrue(same(correct_outputs, opt_outputs))
 
-    @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
+    @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch")
     def test_graph_split_inductor_layout_optimizations_training(self):
         self._test_graph_split_inductor_layout_optimizations_impl(
             contextlib.nullcontext
         )
 
-    @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
+    @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch")
     def test_graph_split_inductor_layout_optimizations_inference(self):
         self._test_graph_split_inductor_layout_optimizations_impl(torch.no_grad)
 
     @patch.object(config, "optimize_ddp", True)
-    @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
+    @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch")
     def test_graph_split_inductor_transpose(self):
         assert config.optimize_ddp
 
@@ -1470,7 +1470,7 @@ class TestSingleProc(DynamoDistributedSingleProcTestCase):
         self.assertTrue(same(correct_outputs, opt_outputs))
         self.assertEqual(check_splits_compiler.compiler_called, 3)
 
-    @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
+    @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch")
     def test_empty_graph_inductor(self):
         def fn():
             get_world_size = torch.distributed.distributed_c10d.get_world_size()
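For context, every hunk above makes the same one-line change: the `unittest.skipIf` guard on a test goes from checking `HAS_GPU` (from `torch.testing._internal.inductor_utils`) back to calling `has_triton()` (from `torch.utils._triton`). The sketch below shows the two gating styles side by side. It is illustrative only: `has_triton()` and `HAS_GPU` are the PyTorch helpers referenced in the diff, while the test class and method names are hypothetical, and importing `torch.testing._internal` assumes a PyTorch development/test environment.

# Minimal sketch (not part of this PR) of the two skip-gates swapped by this revert.
import unittest

from torch.testing._internal.inductor_utils import HAS_GPU  # gate introduced by #133408
from torch.utils._triton import has_triton  # gate restored by this revert


class TritonGateExample(unittest.TestCase):
    @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch")
    def test_gated_on_triton(self):
        # Skipped unless has_triton() reports a usable Triton for this setup.
        self.assertTrue(has_triton())

    @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
    def test_gated_on_gpu(self):
        # Skipped unless the Inductor test utilities detect a supported GPU.
        self.assertTrue(HAS_GPU)


if __name__ == "__main__":
    unittest.main()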