From 36428f91e969dd27c6f2aa2665da3e5180ba4deb Mon Sep 17 00:00:00 2001
From: PyTorch MergeBot
Date: Fri, 27 Sep 2024 16:54:27 +0000
Subject: [PATCH] Revert "Add Triton CPU as an Inductor backend (#133408)"

This reverts commit 31c0467594c7c41c8e8ff1828bf01fa31fc4454f.

Reverted https://github.com/pytorch/pytorch/pull/133408 on behalf of https://github.com/int3 due to internal tests failing ([comment](https://github.com/pytorch/pytorch/pull/133408#issuecomment-2379692517))
---
 .ci/docker/build.sh                           |   1 -
 .../fsdp/test_fully_shard_compile.py          |  26 +--
 .../fully_shard/test_fully_shard_compile.py   |   4 +-
 .../test_replicate_with_compiler.py           |  14 +-
 .../_tensor/test_dtensor_compile.py           |  10 +-
 .../fsdp/test_fsdp_use_orig_params.py         |   4 +-
 .../tensor/parallel/test_micro_pipeline_tp.py |  18 +-
 .../test_c10d_functional_native.py            |  28 +--
 .../test_compute_comm_reordering.py           |  14 +-
 test/distributed/test_dynamo_distributed.py   |  50 ++---
 test/distributed/test_functional_api.py       |   8 +-
 test/distributed/test_inductor_collectives.py |  38 ++--
 test/inductor/test_cuda_repro.py              |   4 +-
 test/inductor/test_memory_planning.py         |   3 +-
 test/inductor/test_triton_cpu_backend.py      |  34 ----
 test/test_sparse_semi_structured.py           |   8 +-
 torch/_dynamo/device_interface.py             |  49 -----
 torch/_inductor/autotune_process.py           |  72 ++++---
 torch/_inductor/codegen/common.py             |   7 +-
 torch/_inductor/codegen/cpp.py                |   2 +-
 .../_inductor/codegen/cpp_template_kernel.py  |   4 +-
 .../codegen/cpu_device_op_overrides.py        |  26 ---
 torch/_inductor/codegen/halide.py             |   1 -
 .../codegen/rocm/rocm_benchmark_request.py    |   8 +-
 torch/_inductor/codegen/triton.py             |   7 +-
 torch/_inductor/codegen/wrapper.py            | 184 +++++++++---------
 torch/_inductor/config.py                     |   2 +-
 torch/_inductor/runtime/benchmarking.py       |   2 +-
 torch/_inductor/runtime/triton_helpers.py     |  28 ---
 torch/_inductor/runtime/triton_heuristics.py  |  13 --
 torch/_inductor/scheduler.py                  |   3 +-
 torch/_inductor/select_algorithm.py           |  17 +-
 torch/_inductor/utils.py                      |   8 +-
 torch/utils/_triton.py                        |  16 +-
 34 files changed, 258 insertions(+), 455 deletions(-)
 delete mode 100644 test/inductor/test_triton_cpu_backend.py
 delete mode 100644 torch/_inductor/codegen/cpu_device_op_overrides.py

diff --git a/.ci/docker/build.sh b/.ci/docker/build.sh
index b46626f9c3be..14a44b3d1702 100755
--- a/.ci/docker/build.sh
+++ b/.ci/docker/build.sh
@@ -379,7 +379,6 @@ case "$image" in
     GCC_VERSION=11
     CONDA_CMAKE=yes
     HALIDE=yes
-    TRITON=yes
     ;;
   pytorch-linux-focal-linter)
     # TODO: Use 3.9 here because of this issue https://github.com/python/mypy/issues/13627.
diff --git a/test/distributed/_composable/fsdp/test_fully_shard_compile.py b/test/distributed/_composable/fsdp/test_fully_shard_compile.py index 7257f6a1e98b..ecd1e7a6d9aa 100644 --- a/test/distributed/_composable/fsdp/test_fully_shard_compile.py +++ b/test/distributed/_composable/fsdp/test_fully_shard_compile.py @@ -30,7 +30,7 @@ from torch.testing._internal.distributed._tensor.common_dtensor import ( ModelArgs, Transformer, ) -from torch.testing._internal.inductor_utils import HAS_GPU +from torch.utils._triton import has_triton log = logging.getLogger(__name__) @@ -48,7 +48,7 @@ orig_F_scaled_dot_product_attention = F.scaled_dot_product_attention class TestFullyShardCompileCompute(FSDPTest): - @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") + @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch") @skip_if_lt_x_gpu(2) def test_disable_compiling_hooks(self): self.run_subtests( @@ -529,14 +529,14 @@ val.shape: {[node.meta['val'].shape for node in aliased_graph_inputs]}, return model_init_fn, input_creation_fn @skipIfRocm - @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") + @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch") def test_simple_mlp_fullgraph_backend_aot_eager(self): self._test_traceable_fsdp( *self._create_simple_mlp_factory_fns(), "aot_eager", fullgraph=True ) @skipIfRocm - @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") + @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch") def test_simple_mlp_fullgraph_backend_aot_eager_decomp_partition(self): self._test_traceable_fsdp( *self._create_simple_mlp_factory_fns(), @@ -545,7 +545,7 @@ val.shape: {[node.meta['val'].shape for node in aliased_graph_inputs]}, ) @skipIfRocm - @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") + @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch") def test_simple_mlp_fullgraph_backend_inductor(self): self._test_traceable_fsdp( *self._create_simple_mlp_factory_fns(), "inductor", fullgraph=True @@ -613,7 +613,7 @@ val.shape: {[node.meta['val'].shape for node in aliased_graph_inputs]}, return model_init_fn, input_creation_fn @skipIfRocm - @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") + @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch") def test_nested_fully_shard_backend_aot_eager(self): for fullgraph in [True, False]: self._test_traceable_fsdp( @@ -623,7 +623,7 @@ val.shape: {[node.meta['val'].shape for node in aliased_graph_inputs]}, ) @skipIfRocm - @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") + @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch") def test_nested_fully_shard_backend_aot_eager_decomp_partition(self): for fullgraph in [True, False]: self._test_traceable_fsdp( @@ -633,7 +633,7 @@ val.shape: {[node.meta['val'].shape for node in aliased_graph_inputs]}, ) @skipIfRocm - @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") + @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch") def test_nested_fully_shard_backend_inductor_fullgraph_True(self): for fullgraph in [True]: with self._reinplace_all_gather_with_optional_checks( @@ -729,7 +729,7 @@ val.shape: {[node.meta['val'].shape for node in aliased_graph_inputs]}, file_check.run(bwd_code) @skipIfRocm - @unittest.skipIf(not 
HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") + @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch") def test_nested_fully_shard_backend_inductor_fullgraph_False(self): _, triton_codes = run_and_get_code( lambda: self._test_traceable_fsdp( @@ -806,7 +806,7 @@ val.shape: {[node.meta['val'].shape for node in aliased_graph_inputs]}, return contextlib.nullcontext() @skipIfRocm - @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") + @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch") def test_transformer_backend_aot_eager(self): for fullgraph, all_requires_grad in itertools.product( [True, False], [True, False] @@ -823,7 +823,7 @@ val.shape: {[node.meta['val'].shape for node in aliased_graph_inputs]}, ) @skipIfRocm - @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") + @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch") # TODO: native_dropout has worse accuracy after decomp, need to figure out why @torch._inductor.config.patch(fallback_random=True) def test_transformer_backend_aot_eager_decomp_partition(self): @@ -840,7 +840,7 @@ val.shape: {[node.meta['val'].shape for node in aliased_graph_inputs]}, ) @skipIfRocm - @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") + @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch") # TODO: native_dropout causes CUDA IMA error, need to figure out why @torch._inductor.config.patch(fallback_random=True) def test_transformer_backend_inductor_fullgraph_True(self): @@ -943,7 +943,7 @@ val.shape: {[node.meta['val'].shape for node in aliased_graph_inputs]}, file_check.run(bwd_code) @skipIfRocm - @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") + @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch") # TODO: native_dropout causes CUDA IMA error, need to figure out why @torch._inductor.config.patch(fallback_random=True) def test_transformer_backend_inductor_fullgraph_False(self): diff --git a/test/distributed/_composable/fully_shard/test_fully_shard_compile.py b/test/distributed/_composable/fully_shard/test_fully_shard_compile.py index 5c48e3f31f6b..6e359d3fc61a 100644 --- a/test/distributed/_composable/fully_shard/test_fully_shard_compile.py +++ b/test/distributed/_composable/fully_shard/test_fully_shard_compile.py @@ -18,7 +18,7 @@ from torch.testing._internal.common_fsdp import ( TransformerWithSharedParams, ) from torch.testing._internal.common_utils import run_tests, TEST_WITH_DEV_DBG_ASAN -from torch.testing._internal.inductor_utils import HAS_GPU +from torch.utils._triton import has_triton if not dist.is_available(): @@ -38,7 +38,7 @@ class TestCompile(FSDPTest): def world_size(self) -> int: return torch.cuda.device_count() - @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") + @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch") @skip_if_lt_x_gpu(2) def test_compile(self): self.run_subtests( diff --git a/test/distributed/_composable/test_replicate_with_compiler.py b/test/distributed/_composable/test_replicate_with_compiler.py index 03dc74b113ce..e0df5aaff27f 100644 --- a/test/distributed/_composable/test_replicate_with_compiler.py +++ b/test/distributed/_composable/test_replicate_with_compiler.py @@ -33,7 +33,7 @@ from torch.testing._internal.common_distributed import ( ) from torch.testing._internal.common_utils import 
run_tests, skipIfRocm from torch.testing._internal.distributed.fake_pg import FakeStore -from torch.testing._internal.inductor_utils import HAS_GPU +from torch.utils._triton import has_triton from torch.utils.checkpoint import checkpoint @@ -216,21 +216,21 @@ class ReplicateTest(MultiProcessInductorTestCase): ] self._test_compile(use_gpu=False, no_sync=True) - @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") + @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch") @skip_if_rocm_multiprocess @skip_if_lt_x_gpu(2) @torch._inductor.config.patch(reorder_for_locality=False) def test_compile_gpu(self): self._test_compile(use_gpu=True, no_sync=False, checkpoint=False) - @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") + @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch") @skip_if_rocm_multiprocess @skip_if_lt_x_gpu(2) @torch._inductor.config.patch(reorder_for_locality=False) def test_compile_gpu_ac(self): self._test_compile(use_gpu=True, no_sync=False, checkpoint=True) - @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") + @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch") @skip_if_rocm_multiprocess @skip_if_lt_x_gpu(2) def test_compile_bf16(self): @@ -244,7 +244,7 @@ class ReplicateTest(MultiProcessInductorTestCase): self._test_compile(use_gpu=True, no_sync=False, setup_func=setup) - @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") + @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch") @skip_if_rocm_multiprocess @skip_if_lt_x_gpu(2) def test_compile_fp16(self): @@ -261,7 +261,7 @@ class ReplicateTest(MultiProcessInductorTestCase): use_gpu=True, no_sync=False, setup_func=setup, no_inductor=True ) - @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") + @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch") @skip_if_rocm_multiprocess @skip_if_lt_x_gpu(2) def test_compile_backward_only(self): @@ -385,7 +385,7 @@ class DDP_TP_Test(InductorTestCase): def tearDown(self): dist.destroy_process_group() - @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") + @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch") @skipIfRocm def test_ddp_tp(self): ref_model = Net() diff --git a/test/distributed/_tensor/test_dtensor_compile.py b/test/distributed/_tensor/test_dtensor_compile.py index a32cb2a8ae56..3f4ddfce7813 100644 --- a/test/distributed/_tensor/test_dtensor_compile.py +++ b/test/distributed/_tensor/test_dtensor_compile.py @@ -46,7 +46,7 @@ from torch.testing._internal.distributed._tensor.common_dtensor import ( with_comms, ) from torch.testing._internal.distributed.fake_pg import FakeStore -from torch.testing._internal.inductor_utils import HAS_GPU +from torch.utils._triton import has_triton from torch.utils.checkpoint import checkpoint @@ -439,7 +439,7 @@ class TestDTensorCompile(torch._dynamo.test_case.TestCase): tmp_dt._local_tensor.stride(), tmp_dt_fake._local_tensor.stride() ) - @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") + @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch") def test_dtensor_contiguous_dtensor_noncontiguous_local_as_tangent(self): # Partial -> Shard on an unbalanced tensor results in: # - A contiguous DTensor @@ -515,7 +515,7 @@ class 
TestDTensorCompile(torch._dynamo.test_case.TestCase): out_test = opt_mod(dt) self.assertEqual(out_ref, out_test) - @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") + @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch") def test_dtensor_different_gradient_placement(self): mesh = DeviceMesh(self.device_type, torch.arange(self.world_size)) @@ -647,7 +647,7 @@ def forward(self, primals_1): return (sin_1, primals_1, wait_tensor)""", ) - @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") + @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch") def test_dtensor_partial_placement_graph_output(self): mesh = DeviceMesh(self.device_type, torch.arange(self.world_size)) @@ -665,7 +665,7 @@ def forward(self, primals_1): out_dt = torch.matmul(tmp_dt, y_dt) out_dt.sum().backward() - @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") + @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch") @skip_if_lt_x_gpu(1) # TODO: somehow inductor bg compile threads are causing hangs at exit with distributed work dtor @patch.object(torch._inductor.config, "compile_threads", 1) diff --git a/test/distributed/fsdp/test_fsdp_use_orig_params.py b/test/distributed/fsdp/test_fsdp_use_orig_params.py index e477c043c4d6..e3863a02e797 100644 --- a/test/distributed/fsdp/test_fsdp_use_orig_params.py +++ b/test/distributed/fsdp/test_fsdp_use_orig_params.py @@ -43,7 +43,7 @@ from torch.testing._internal.common_utils import ( TEST_WITH_DEV_DBG_ASAN, TestCase, ) -from torch.testing._internal.inductor_utils import HAS_GPU +from torch.utils._triton import has_triton if not dist.is_available(): @@ -218,7 +218,7 @@ class TestFSDPUseOrigParamsMultipleParamGroups(FSDPTest): raise ValueError(f"Invalid string: {sharding_strategy_str}") return sharding_strategy - @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") + @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch") @skip_if_lt_x_gpu(2) def test_fsdp_compile(self): self.run_subtests( diff --git a/test/distributed/tensor/parallel/test_micro_pipeline_tp.py b/test/distributed/tensor/parallel/test_micro_pipeline_tp.py index 3a4b2eb946fc..951e77188364 100644 --- a/test/distributed/tensor/parallel/test_micro_pipeline_tp.py +++ b/test/distributed/tensor/parallel/test_micro_pipeline_tp.py @@ -37,7 +37,7 @@ from torch.testing._internal.common_utils import ( # type: ignore[attr-defined] ) from torch.testing._internal.distributed._tensor.common_dtensor import MLPModule from torch.testing._internal.distributed.fake_pg import FakeStore -from torch.testing._internal.inductor_utils import HAS_GPU +from torch.utils._triton import has_triton def _make_post_grad_fx(f, *inps): @@ -78,7 +78,7 @@ class MicroPipelineTPTest(TestCase): def tearDown(self): dist.destroy_process_group() - @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") + @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch") @fresh_inductor_cache() def test_find_all_gather_patterns(self): group = dist.group.WORLD @@ -129,7 +129,7 @@ class MicroPipelineTPTest(TestCase): torch.ops.aten.view.dtype, ) - @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") + @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch") @fresh_inductor_cache() def test_find_reduce_scatter_patterns(self): group = dist.group.WORLD @@ -168,7 
+168,7 @@ class MicroPipelineTPTest(TestCase): self.assertEqual(reduce_scatters[1].reduce_op, "avg") self.assertEqual(reduce_scatters[1].scatter_dim, 1) - @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") + @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch") @fresh_inductor_cache() def test_get_unexposed_collectives(self): group = dist.group.WORLD @@ -193,7 +193,7 @@ class MicroPipelineTPTest(TestCase): ["all_gather_into_tensor", "reduce_scatter_tensor"], ) - @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") + @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch") @parametrize("A_dims", [2, 3]) @parametrize("gather_dim", [0, 1, 2]) @fresh_inductor_cache() @@ -231,7 +231,7 @@ class MicroPipelineTPTest(TestCase): self.assertNotIn("all_gather_into_tensor", code) @runOnRocmArch(MI300_ARCH) - @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") + @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch") @parametrize("A_dims", [2, 3]) @parametrize("gather_dim", [0, 1, 2]) @fresh_inductor_cache() @@ -299,7 +299,7 @@ class MicroPipelineTPTest(TestCase): self.assertIn("fused_all_gather_scaled_matmul", code) self.assertNotIn("all_gather_into_tensor", code) - @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") + @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch") @parametrize("A_dims", [2, 3]) @parametrize("scatter_dim", [0, 1, 2]) @fresh_inductor_cache() @@ -328,7 +328,7 @@ class MicroPipelineTPTest(TestCase): self.assertNotIn("reduce_scatter_tensor", code) @runOnRocmArch(MI300_ARCH) - @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") + @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch") @parametrize("A_dims", [2, 3]) @parametrize("scatter_dim", [0, 1, 2]) @fresh_inductor_cache() @@ -381,7 +381,7 @@ class MicroPipelineTPTest(TestCase): self.assertIn("fused_scaled_matmul_reduce_scatter", code) self.assertNotIn("reduce_scatter_tensor", code) - @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") + @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch") @parametrize("shard_dim", [0, 1]) @fresh_inductor_cache() def test_dtensor_seq_par(self, shard_dim: int): diff --git a/test/distributed/test_c10d_functional_native.py b/test/distributed/test_c10d_functional_native.py index 0bea36967b88..619c4b7887f7 100644 --- a/test/distributed/test_c10d_functional_native.py +++ b/test/distributed/test_c10d_functional_native.py @@ -28,7 +28,7 @@ from torch.testing._internal.common_utils import ( # type: ignore[attr-defined] TestCase, ) from torch.testing._internal.distributed.fake_pg import FakeStore -from torch.testing._internal.inductor_utils import HAS_GPU +from torch.utils._triton import has_triton def load_test_module(name): @@ -218,7 +218,7 @@ class TestWithNCCL(MultiProcessTestCase): assert output.eq(expect).all() assert output.completed - @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") + @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch") @skip_if_lt_x_gpu(2) # https://github.com/pytorch/pytorch/issues/126338 def test_inductor_dtypeview_memory_leak(self): @@ -434,7 +434,7 @@ class TestWithNCCL(MultiProcessTestCase): torch.ops._c10d_functional.wait_tensor(tensor) self.assertTrue(wait_called) - 
@unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") + @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch") @skip_if_lt_x_gpu(2) @fresh_inductor_cache() def test_threading(self): @@ -498,7 +498,7 @@ class CompileTest(TestCase): def tearDown(self): dist.destroy_process_group() - @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") + @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch") @fresh_inductor_cache() def test_inductor_all_reduce_single(self): def func(arg: torch.Tensor) -> torch.Tensor: @@ -535,7 +535,7 @@ class CompileTest(TestCase): out = AOTIRunnerUtil.run("cuda", func, (arg,)) torch.cuda.synchronize() - @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") + @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch") @fresh_inductor_cache() def test_inductor_all_reduce_coalesced(self): def func(args: List[torch.Tensor]) -> torch.Tensor: @@ -581,7 +581,7 @@ class CompileTest(TestCase): out = AOTIRunnerUtil.run("cuda", func, (args,)) torch.cuda.synchronize() - @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") + @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch") @fresh_inductor_cache() def test_inductor_inplace_op_on_view(self): def func(arg: torch.Tensor) -> torch.Tensor: @@ -608,7 +608,7 @@ class CompileTest(TestCase): .run(code) ) - @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") + @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch") @fresh_inductor_cache() def test_inductor_reuse_buffer_after_inplace_collective(self): def func(arg: torch.Tensor) -> torch.Tensor: @@ -643,7 +643,7 @@ class CompileTest(TestCase): ) assert "= torch.ops._c10d_functional.wait_tensor.default" not in code - @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") + @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch") @fresh_inductor_cache() def test_inductor_all_gather_into_tensor_single(self): def func(arg: torch.Tensor) -> torch.Tensor: @@ -670,7 +670,7 @@ class CompileTest(TestCase): out = AOTIRunnerUtil.run("cuda", func, (arg,)) torch.cuda.synchronize() - @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") + @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch") @fresh_inductor_cache() def test_inductor_all_gather_into_tensor_coalesced(self): def func(args: List[torch.Tensor]) -> torch.Tensor: @@ -704,7 +704,7 @@ class CompileTest(TestCase): out = AOTIRunnerUtil.run("cuda", func, (args,)) torch.cuda.synchronize() - @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") + @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch") @fresh_inductor_cache() def test_inductor_reduce_scatter_tensor_single(self): def func(arg: torch.Tensor) -> torch.Tensor: @@ -730,7 +730,7 @@ class CompileTest(TestCase): out = AOTIRunnerUtil.run("cuda", func, (arg,)) torch.cuda.synchronize() - @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") + @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch") @fresh_inductor_cache() def test_inductor_reduce_scatter_tensor_coalesced(self): def func(args: List[torch.Tensor]) -> torch.Tensor: @@ -766,7 +766,7 @@ class CompileTest(TestCase): AOTIRunnerUtil.run("cuda", func, (args,)) 
torch.cuda.synchronize() - @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") + @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch") @fresh_inductor_cache() def test_inductor_all_to_all_single(self): def _tolist_with_constrain_as_size(tensor): @@ -814,7 +814,7 @@ class CompileTest(TestCase): .run(code) ) - @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") + @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch") @fresh_inductor_cache() def test_inductor_broadcast(self): def func(arg: torch.Tensor) -> torch.Tensor: @@ -850,7 +850,7 @@ class CompileTest(TestCase): out = AOTIRunnerUtil.run("cuda", func, (arg,)) torch.cuda.synchronize() - @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") + @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch") @fresh_inductor_cache() def test_ranks_and_tag(self): def func(arg: torch.Tensor) -> torch.Tensor: diff --git a/test/distributed/test_compute_comm_reordering.py b/test/distributed/test_compute_comm_reordering.py index bb0e54484b94..db0d6c8becbf 100644 --- a/test/distributed/test_compute_comm_reordering.py +++ b/test/distributed/test_compute_comm_reordering.py @@ -28,7 +28,7 @@ from torch.testing._internal.common_distributed import ( DynamoDistributedMultiProcTestCase, requires_nccl, ) -from torch.testing._internal.inductor_utils import HAS_GPU +from torch.utils._triton import has_triton def get_snode_runtime_for_reorder_compute_test(snode): @@ -92,7 +92,7 @@ class TestComputeCommReorderingMultiProc(DynamoDistributedMultiProcTestCase): # works around issue with skipif<2 and workers with unpredictable #s gpu return 2 - @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") + @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch") @patch.object(torch._inductor.config, "allow_buffer_reuse", True) # TODO: somehow inductor bg compile threads are causing hangs at exit with distributed work dtor @patch.object(torch._inductor.config, "compile_threads", 1) @@ -131,7 +131,7 @@ class TestComputeCommReorderingMultiProc(DynamoDistributedMultiProcTestCase): correct = func(inputs) self.assertTrue(same(out, correct)) - @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") + @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch") @patch.object(torch._inductor.config, "allow_buffer_reuse", True) # TODO: somehow inductor bg compile threads are causing hangs at exit with distributed work dtor @patch.object(torch._inductor.config, "compile_threads", 1) @@ -178,7 +178,7 @@ class TestComputeCommReorderingMultiProc(DynamoDistributedMultiProcTestCase): correct = func(inputs) self.assertTrue(same(out, correct)) - @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") + @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch") @patch.object(torch._inductor.config, "allow_buffer_reuse", True) # TODO: somehow inductor bg compile threads are causing hangs at exit with distributed work dtor @patch.object(torch._inductor.config, "compile_threads", 1) @@ -231,7 +231,7 @@ class TestComputeCommReorderingMultiProc(DynamoDistributedMultiProcTestCase): correct = func(inputs, **self.get_world_trs()) self.assertTrue(same(out, correct)) - @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") + @unittest.skipIf(not has_triton(), "Inductor+gpu needs 
triton and recent GPU arch") @patch.object(torch._inductor.config, "allow_buffer_reuse", True) # TODO: somehow inductor bg compile threads are causing hangs at exit with distributed work dtor @patch.object(torch._inductor.config, "compile_threads", 1) @@ -283,7 +283,7 @@ class TestComputeCommReorderingMultiProc(DynamoDistributedMultiProcTestCase): correct = func(inputs, **self.get_world_trs()) self.assertTrue(same(out, correct)) - @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") + @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch") @patch.object(torch._inductor.config, "allow_buffer_reuse", True) # TODO: somehow inductor bg compile threads are causing hangs at exit with distributed work dtor @patch.object(torch._inductor.config, "compile_threads", 1) @@ -340,7 +340,7 @@ class TestComputeCommReorderingMultiProc(DynamoDistributedMultiProcTestCase): correct = func(inputs, **self.get_world_trs()) self.assertTrue(same(out, correct)) - @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") + @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch") # TODO: somehow inductor bg compile threads are causing hangs at exit with distributed work dtor @patch.object(torch._inductor.config, "compile_threads", 1) @patch.object( diff --git a/test/distributed/test_dynamo_distributed.py b/test/distributed/test_dynamo_distributed.py index 1cf98a4d8b40..d6357678c94f 100644 --- a/test/distributed/test_dynamo_distributed.py +++ b/test/distributed/test_dynamo_distributed.py @@ -46,7 +46,7 @@ from torch.testing._internal.common_distributed import ( skip_if_lt_x_gpu, ) from torch.testing._internal.common_utils import requires_cuda -from torch.testing._internal.inductor_utils import HAS_GPU +from torch.utils._triton import has_triton def reset_rng_state(): @@ -325,7 +325,7 @@ def run_hf_bert_ddp(self, model, inputs, backend): class TestFakeDistributedSingleProc(torch._dynamo.test_case.TestCase): - @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") + @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch") @patch.object(config, "optimize_ddp", True) @patch.object(torch._inductor.config, "fallback_random", True) def test_hf_bert_ddp_inductor(self): @@ -528,7 +528,7 @@ class TestMultiProc(DynamoDistributedMultiProcTestCase): @skip_if_lt_x_gpu(2) @import_transformers_or_skip() - @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") + @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch") @config.patch(optimize_ddp=True, enable_compiler_collectives=True) @patch.object(torch._inductor.config, "fallback_random", True) def test_hf_bert_ddp_inductor(self): @@ -536,7 +536,7 @@ class TestMultiProc(DynamoDistributedMultiProcTestCase): @skip_if_lt_x_gpu(2) @import_transformers_or_skip() - @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") + @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch") @config.patch(optimize_ddp=True, enable_compiler_collectives=True) @patch.object(torch._inductor.config, "fallback_random", True) def test_hf_bert_ddp_inductor_static_graph(self): @@ -561,7 +561,7 @@ class TestMultiProc(DynamoDistributedMultiProcTestCase): self._test_hf_bert_aot_eager(static_graph=True) @skip_if_lt_x_gpu(2) - @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") + @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton 
and recent GPU arch") @config.patch(optimize_ddp=False, enable_compiler_collectives=True) def test_ddp_activation_checkpointing(self): from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import ( @@ -676,7 +676,7 @@ class TestMultiProc(DynamoDistributedMultiProcTestCase): @config.patch(enable_compiler_collectives=True) @skip_if_lt_x_gpu(1) - @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") + @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch") def test_fsdp_inductor(self): with _dynamo_dist_per_rank_init(self.rank, self.world_size): # Test with basic FSDP wrapping (outer wrap around whole model) @@ -701,7 +701,7 @@ class TestMultiProc(DynamoDistributedMultiProcTestCase): @config.patch(enable_compiler_collectives=True) @skip_if_lt_x_gpu(1) - @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") + @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch") def test_fsdp_activation_checkpointing(self): with _dynamo_dist_per_rank_init(self.rank, self.world_size): model, inputs = get_toy_model_for_activation_checkpointing( @@ -722,7 +722,7 @@ class TestMultiProc(DynamoDistributedMultiProcTestCase): ) @import_transformers_or_skip() - @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") + @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch") # TODO(whc) Investigate why cudagraphs breaks inductor+fsdp for hf_bert @patch.object(torch._inductor.config.triton, "cudagraphs", False) @patch.object(torch._inductor.config, "fallback_random", True) @@ -767,7 +767,7 @@ class TestMultiProc(DynamoDistributedMultiProcTestCase): self.assertTrue(same(correct_results, opt_results)) @import_transformers_or_skip() - @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") + @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch") # TODO(whc) Investigate why cudagraphs breaks inductor+fsdp for hf_bert @patch.object(torch._inductor.config.triton, "cudagraphs", False) @patch.object(torch._inductor.config, "fallback_random", True) @@ -815,7 +815,7 @@ class TestMultiProc(DynamoDistributedMultiProcTestCase): ) self.assertTrue(same(correct_results, opt_results)) - @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") + @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch") @config.patch(enable_compiler_collectives=True) def test_compiler_collectives_automatic_dynamic_tensor(self): with _dynamo_dist_per_rank_init(self.rank, self.world_size): @@ -860,7 +860,7 @@ class TestMultiProc(DynamoDistributedMultiProcTestCase): for r in res[1:]: self.assertEqual(res[0], r) - @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") + @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch") @config.patch(enable_compiler_collectives=True) def test_compiler_collectives_automatic_dynamic_scalar(self): with _dynamo_dist_per_rank_init(self.rank, self.world_size): @@ -888,7 +888,7 @@ class TestMultiProc(DynamoDistributedMultiProcTestCase): for r in res[1:]: self.assertEqual(res[0], r) - @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") + @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch") @config.patch(enable_compiler_collectives=True) def test_compiler_collectives_automatic_dynamic_speculation_divergence(self): with _dynamo_dist_per_rank_init(self.rank, 
self.world_size): @@ -921,7 +921,7 @@ class TestMultiProc(DynamoDistributedMultiProcTestCase): for r in res[1:]: self.assertEqual(res[0], r) - @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") + @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch") @config.patch(enable_compiler_collectives=True) def test_compiler_collectives_graph_break_empty_graph_still_collective(self): with _dynamo_dist_per_rank_init(self.rank, self.world_size): @@ -955,7 +955,7 @@ class TestMultiProc(DynamoDistributedMultiProcTestCase): for r in res[1:]: self.assertEqual(res[0], r) - @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") + @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch") @config.patch(enable_compiler_collectives=True) def test_compiler_collectives_dim_mismatch(self): with _dynamo_dist_per_rank_init(self.rank, self.world_size): @@ -984,7 +984,7 @@ class TestMultiProc(DynamoDistributedMultiProcTestCase): for r in res[1:]: self.assertEqual(res[0], r) - @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") + @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch") @config.patch(enable_compiler_collectives=True) def test_compiler_collectives_missing_source(self): with _dynamo_dist_per_rank_init(self.rank, self.world_size): @@ -1006,7 +1006,7 @@ class TestMultiProc(DynamoDistributedMultiProcTestCase): for r in res[1:]: self.assertEqual(res[0], r) - @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") + @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch") @config.patch(enable_compiler_collectives=True) def test_compiler_collectives_scalar_missing_source(self): with _dynamo_dist_per_rank_init(self.rank, self.world_size): @@ -1028,7 +1028,7 @@ class TestMultiProc(DynamoDistributedMultiProcTestCase): for r in res[1:]: self.assertEqual(res[0], r) - @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") + @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch") @config.patch(enable_compiler_collectives=True) def test_compiler_collectives_type_mismatch(self): with _dynamo_dist_per_rank_init(self.rank, self.world_size): @@ -1062,7 +1062,7 @@ class TestMultiProc(DynamoDistributedMultiProcTestCase): for r in res[1:]: self.assertEqual(res[0], r) - @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") + @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch") @patch.object(torch._inductor.config, "fx_graph_cache", False) @patch.object(torch._inductor.config, "fx_graph_remote_cache", False) def test_asymmetric_compilation(self): @@ -1113,7 +1113,7 @@ class TestMultiProc(DynamoDistributedMultiProcTestCase): for r in res[1:]: self.assertEqual(res[0], r) - @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") + @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch") @patch.object(torch._inductor.config, "fx_graph_cache", True) @patch.object(torch._inductor.config, "fx_graph_remote_cache", False) @patch.object(torch._inductor.config, "sleep_sec_TESTING_ONLY", 10) @@ -1203,7 +1203,7 @@ class TestSingleProc(DynamoDistributedSingleProcTestCase): outputs = ddp_m(inputs) self.assertTrue(same(correct_outputs, outputs)) - @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") + @unittest.skipIf(not has_triton(), "Inductor+gpu needs 
triton and recent GPU arch") @patch.object(config, "optimize_ddp", False) def test_ddp_baseline_inductor(self): from torch.nn.parallel import DistributedDataParallel as DDP @@ -1299,7 +1299,7 @@ class TestSingleProc(DynamoDistributedSingleProcTestCase): self.assertTrue(all("DDPOptimizer" in r.reason for r in break_reasons)) @patch.object(config, "optimize_ddp", True) - @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") + @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch") def test_graph_split_inductor(self): assert config.optimize_ddp """ @@ -1368,18 +1368,18 @@ class TestSingleProc(DynamoDistributedSingleProcTestCase): opt_outputs = opt_fn(inputs) self.assertTrue(same(correct_outputs, opt_outputs)) - @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") + @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch") def test_graph_split_inductor_layout_optimizations_training(self): self._test_graph_split_inductor_layout_optimizations_impl( contextlib.nullcontext ) - @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") + @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch") def test_graph_split_inductor_layout_optimizations_inference(self): self._test_graph_split_inductor_layout_optimizations_impl(torch.no_grad) @patch.object(config, "optimize_ddp", True) - @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") + @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch") def test_graph_split_inductor_transpose(self): assert config.optimize_ddp @@ -1470,7 +1470,7 @@ class TestSingleProc(DynamoDistributedSingleProcTestCase): self.assertTrue(same(correct_outputs, opt_outputs)) self.assertEqual(check_splits_compiler.compiler_called, 3) - @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") + @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch") def test_empty_graph_inductor(self): def fn(): get_world_size = torch.distributed.distributed_c10d.get_world_size() diff --git a/test/distributed/test_functional_api.py b/test/distributed/test_functional_api.py index 2624ce87089f..c4972a464058 100644 --- a/test/distributed/test_functional_api.py +++ b/test/distributed/test_functional_api.py @@ -14,7 +14,7 @@ from functorch import make_fx from torch._inductor.utils import run_and_get_code from torch.testing import FileCheck from torch.testing._internal.distributed.fake_pg import FakeStore -from torch.testing._internal.inductor_utils import HAS_GPU +from torch.utils._triton import has_triton if not dist.is_available(): @@ -564,7 +564,7 @@ class TestCollectivesWithNCCL(MultiProcessTestCase): expected = torch.cat(expected) self.assertEqual(y, expected) - @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") + @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch") @requires_nccl() @with_comms() def test_tracing(self): @@ -574,7 +574,7 @@ class TestCollectivesWithNCCL(MultiProcessTestCase): compiled_allreduce = torch.compile(allreduce, fullgraph=True) compiled_allreduce(torch.randn(8, device=self.device), self.process_group) - @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") + @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch") def test_tracing_with_fakepg(self): exit_if_lt_x_gpu(self.world_size) @@ -590,7 +590,7 @@ class 
TestCollectivesWithNCCL(MultiProcessTestCase): ) allreduce(torch.randn(8, device=self.device), pg=dist.group.WORLD) - @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") + @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch") @requires_nccl() @with_comms() def test_tracing_with_dce_code(self): diff --git a/test/distributed/test_inductor_collectives.py b/test/distributed/test_inductor_collectives.py index 216643e59cde..a19183425d39 100644 --- a/test/distributed/test_inductor_collectives.py +++ b/test/distributed/test_inductor_collectives.py @@ -29,7 +29,7 @@ from torch.testing._internal.common_utils import ( parametrize, requires_cuda, ) -from torch.testing._internal.inductor_utils import HAS_GPU +from torch.utils._triton import has_triton def _tolist_with_constrain_as_size(tensor): @@ -58,7 +58,7 @@ class TestCollectivesMultiProc(DynamoDistributedMultiProcTestCase): # works around issue with skipif<2 and workers with unpredictable #s gpu return 2 - @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") + @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch") @skip_if_lt_x_gpu(2) def test_broadcast_inductor(self): """ @@ -90,7 +90,7 @@ class TestCollectivesMultiProc(DynamoDistributedMultiProcTestCase): compiled_out = compiled_func(*inputs) self.assertTrue(same(eager_out, compiled_out)) - @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") + @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch") @skip_if_lt_x_gpu(2) def test_allreduce_inductor(self): """ @@ -123,7 +123,7 @@ class TestCollectivesMultiProc(DynamoDistributedMultiProcTestCase): inductor_out = compiled_matmul_cat_col(*inputs) self.assertTrue(same(eager_out, inductor_out, tol=0.001)) - @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") + @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch") @skip_if_lt_x_gpu(2) def test_allreduce_inductor_cudagraph_trees(self): """ @@ -169,7 +169,7 @@ class TestCollectivesMultiProc(DynamoDistributedMultiProcTestCase): op = torch.ops.c10d_functional.all_reduce.default self.assertIn(torch.Tag.pt2_compliant_tag, op.tags) - @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") + @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch") @skip_if_lt_x_gpu(2) def test_eager_allreduce_inductor_wait(self): def eager_func(a, b, c, d, *, tag, ranks, group_size): @@ -208,7 +208,7 @@ class TestCollectivesMultiProc(DynamoDistributedMultiProcTestCase): print(f"inductor_out, {inductor_out}") self.assertTrue(same(eager_out, inductor_out, tol=0.001)) - @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") + @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch") @skip_if_lt_x_gpu(2) def test_inductor_allreduce_eager_wait(self): def inductor_func(a, b, c, d, *, tag, ranks, group_size): @@ -243,7 +243,7 @@ class TestCollectivesMultiProc(DynamoDistributedMultiProcTestCase): ) self.assertTrue(same(eager_out, inductor_out, tol=0.001)) - @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") + @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch") @skip_if_lt_x_gpu(2) @patch.object(torch._inductor.config, "allow_buffer_reuse", True) def test_allreduce_input_buffer_reuse(self): @@ -261,7 +261,7 @@ class 
TestCollectivesMultiProc(DynamoDistributedMultiProcTestCase): correct = func(inputs, **self.get_world_trs()) self.assertTrue(same(out, correct)) - @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") + @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch") @skip_if_lt_x_gpu(2) def test_permute_tensor(self): def func(tensor, src_dst_pairs, *, tag, ranks, group_size): @@ -287,7 +287,7 @@ class TestCollectivesMultiProc(DynamoDistributedMultiProcTestCase): self.assertEqual(out, expected) self.assertEqual(correct, expected) - @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") + @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch") @skip_if_lt_x_gpu(2) @patch.object(torch._inductor.config, "allow_buffer_reuse", True) def test_allgather_output_buffer_reuse(self): @@ -311,7 +311,7 @@ class TestCollectivesMultiProc(DynamoDistributedMultiProcTestCase): correct = model(inp, self.world_size, **self.get_world_trs()) self.assertTrue(same(out, correct)) - @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") + @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch") @skip_if_lt_x_gpu(2) def test_allgather_contiguous_input(self): class Model(torch.nn.Module): @@ -335,7 +335,7 @@ class TestCollectivesMultiProc(DynamoDistributedMultiProcTestCase): correct = model(inp, self.world_size, **self.get_world_trs()) self.assertTrue(same(out, correct)) - @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") + @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch") @skip_if_lt_x_gpu(2) def test_allgather_into_tensor_inductor(self): """ @@ -366,7 +366,7 @@ class TestCollectivesMultiProc(DynamoDistributedMultiProcTestCase): inductor_out = compiled_matmul_cat_col(*inputs) self.assertTrue(same(eager_out, inductor_out, tol=0.001)) - @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") + @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch") @skip_if_lt_x_gpu(2) def test_reduce_scatter_tensor_inductor(self): def example(a, b, *, tag, ranks, group_size): @@ -393,7 +393,7 @@ class TestCollectivesMultiProc(DynamoDistributedMultiProcTestCase): inductor_out = compiled_fn(*inputs) self.assertTrue(same(eager_out, inductor_out, tol=0.001)) - @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") + @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch") @skip_if_lt_x_gpu(2) @patch.object(torch._dynamo.config, "capture_scalar_outputs", True) def test_all_to_all_single_inductor(self): @@ -462,7 +462,7 @@ class TestCollectivesMultiProc(DynamoDistributedMultiProcTestCase): inductor_out = compiled_fn(*inputs, **trs) self.assertTrue(same(eager_out, inductor_out, tol=0.001)) - @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") + @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch") @skip_if_lt_x_gpu(2) def test_all_to_all_single_inductor_split_sizes_none(self): def example(inp, *, tag, ranks, group_size): @@ -518,7 +518,7 @@ class TestCollectivesInductor(DynamoDistributedSingleProcTestCase): "group_size": world_size, } - @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") + @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch") @torch._inductor.config.patch(debug=True) def test_inductor_single_op(self): def 
func(inp, *, tag, ranks, group_size): @@ -547,7 +547,7 @@ class TestCollectivesInductor(DynamoDistributedSingleProcTestCase): correct = func(inputs, **self.get_world_trs()) self.assertTrue(same(out, correct)) - @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") + @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch") @torch._inductor.config.patch(debug=True) def test_inductor_steal_buffer(self): """ @@ -582,7 +582,7 @@ class TestCollectivesInductor(DynamoDistributedSingleProcTestCase): correct = func(inputs, **self.get_world_trs()) self.assertTrue(same(out, correct)) - @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") + @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch") @torch._inductor.config.patch({"debug": True, "triton.descriptive_names": False}) def test_inductor_doesnt_mutate_shared(self): """ @@ -1030,7 +1030,7 @@ class TestCollectivesInductor(DynamoDistributedSingleProcTestCase): out = torch.ops.c10d_functional.all_reduce(x, "sum", **self.get_world_trs()) self.assertEqual(x.size(), out.size()) - @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") + @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch") @torch._inductor.config.patch({"debug": True, "triton.descriptive_names": False}) def test_inductor_all_gather_coalesced(self): """ @@ -1076,7 +1076,7 @@ class TestCollectivesInductor(DynamoDistributedSingleProcTestCase): correct = func(inputs, **self.get_world_trs()) assert same(out, correct), f"{out} va {correct}" - @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") + @unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch") @torch._inductor.config.patch({"debug": True, "triton.descriptive_names": False}) def test_inductor_reduce_scatter_coalesced(self): """ diff --git a/test/inductor/test_cuda_repro.py b/test/inductor/test_cuda_repro.py index cb6d139bbcf3..019be5af16fd 100644 --- a/test/inductor/test_cuda_repro.py +++ b/test/inductor/test_cuda_repro.py @@ -1319,9 +1319,7 @@ class CudaReproTests(TestCase): self.assertEqual(expect, actual) # Expect the code iterates in contiguous order, and is not tiled - lines = code[0].split("\n") - start = lines.index("@triton.jit") - kernel_code = "\n".join(lines[start : start + 14]) + kernel_code = "\n".join(code[0].split("\n")[60:74]) self.assertExpectedInline( kernel_code, """\ diff --git a/test/inductor/test_memory_planning.py b/test/inductor/test_memory_planning.py index 6ded0991319f..6331c91904eb 100644 --- a/test/inductor/test_memory_planning.py +++ b/test/inductor/test_memory_planning.py @@ -22,9 +22,10 @@ from torch._inductor import config from torch._inductor.test_case import run_tests, TestCase from torch._inductor.utils import run_and_get_cpp_code from torch.export import Dim +from torch.utils._triton import has_triton -@unittest.skipIf(not HAS_CUDA, "Inductor+gpu needs triton and CUDA") +@unittest.skipIf(not has_triton(), "Inductor+gpu needs triton and recent GPU arch") @config.patch(memory_planning=True) class TestMemoryPlanning(TestCase): def _generate(self, *, device): diff --git a/test/inductor/test_triton_cpu_backend.py b/test/inductor/test_triton_cpu_backend.py deleted file mode 100644 index 8be4d85caea6..000000000000 --- a/test/inductor/test_triton_cpu_backend.py +++ /dev/null @@ -1,34 +0,0 @@ -# Owner(s): ["module: inductor"] -from torch._inductor import config -from torch._inductor.test_case import 
run_tests -from torch.testing._internal.inductor_utils import HAS_CPU -from torch.utils._triton import has_triton - - -try: - from . import test_torchinductor -except ImportError: - import test_torchinductor - -if has_triton(): - import triton - - TRITON_HAS_CPU = "cpu" in triton.backends.backends -else: - TRITON_HAS_CPU = False - - -if HAS_CPU and TRITON_HAS_CPU: - - @config.patch(cpu_backend="triton") - class SweepInputsCpuTritonTest(test_torchinductor.SweepInputsCpuTest): - pass - - @config.patch(cpu_backend="triton") - class CpuTritonTests(test_torchinductor.CpuTests): - pass - - -if __name__ == "__main__": - if HAS_CPU and TRITON_HAS_CPU: - run_tests(needs="filelock") diff --git a/test/test_sparse_semi_structured.py b/test/test_sparse_semi_structured.py index 6bac38723d30..0f7c96065bcd 100644 --- a/test/test_sparse_semi_structured.py +++ b/test/test_sparse_semi_structured.py @@ -38,10 +38,10 @@ from torch.testing._internal.common_utils import ( IS_WINDOWS, ) -from torch.testing._internal.inductor_utils import HAS_GPU - import pytest +from torch.utils._triton import has_triton + SEMI_STRUCTURED_SUPPORTED_BACKENDS = dict() _IS_SM8X = False @@ -981,7 +981,7 @@ class TestSparseSemiStructuredCUTLASS(TestCase): torch.backends.cuda.matmul.allow_tf32 = orig - @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") + @unittest.skipIf(not has_triton(), "Test needs triton and recent GPU arch") @inference_dtypes def test_conversions(self, device, dtype): @@ -1009,7 +1009,7 @@ class TestSparseSemiStructuredCUTLASS(TestCase): for r, c in shapes: run_test(r, c, device, dtype) - @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") + @unittest.skipIf(not has_triton(), "Test needs triton and recent GPU arch") @inference_dtypes def test_conversions_all_patterns(self, device, dtype): r, c = 32, 128 diff --git a/torch/_dynamo/device_interface.py b/torch/_dynamo/device_interface.py index ad58a11bbd56..00975cda5508 100644 --- a/torch/_dynamo/device_interface.py +++ b/torch/_dynamo/device_interface.py @@ -1,7 +1,5 @@ # mypy: allow-untyped-defs import inspect -import time -from dataclasses import dataclass from typing import Any, Callable, Dict, Iterable, Optional, Tuple, Type, Union import torch @@ -297,51 +295,6 @@ class XpuInterface(DeviceInterface): return torch.xpu.is_bf16_supported() -@dataclass -class CpuDeviceProperties: - multi_processor_count: int - - -class CpuInterface(DeviceInterface): - class Event(_EventBase): - def __init__(self, enable_timing=True): - self.time = 0.0 - - def elapsed_time(self, end_event) -> float: - return (end_event.time - self.time) * 1000 - - def record(self): - self.time = time.perf_counter() - - @staticmethod - def is_available() -> bool: - return True - - @staticmethod - def get_compute_capability(device: _device_t = None) -> str: - return "" - - @staticmethod - def get_raw_stream(device_idx) -> int: - return 0 - - @staticmethod - def current_device(): - return 0 - - @staticmethod - def synchronize(device: _device_t = None): - pass - - class Worker: - @staticmethod - def get_device_properties(device: _device_t = None): - import multiprocessing - - cpu_count = multiprocessing.cpu_count() - return CpuDeviceProperties(cpu_count) - - device_interfaces: Dict[str, Type[DeviceInterface]] = {} _device_initialized = False @@ -380,6 +333,4 @@ def init_device_reg(): for i in range(torch.xpu.device_count()): register_interface_for_device(f"xpu:{i}", XpuInterface) - register_interface_for_device("cpu", CpuInterface) - 
_device_initialized = True diff --git a/torch/_inductor/autotune_process.py b/torch/_inductor/autotune_process.py index 8dcda8fa3d7d..94efbaf4e32f 100644 --- a/torch/_inductor/autotune_process.py +++ b/torch/_inductor/autotune_process.py @@ -574,7 +574,7 @@ class TestBenchmarkRequest(BenchmarkRequest): return self.value -class GPUDeviceBenchmarkMixin: +class GPUDeviceBenchmarkRequest(BenchmarkRequest): def do_bench( self, fn, @@ -601,17 +601,7 @@ class GPUDeviceBenchmarkMixin: return out -class CPUDeviceBenchmarkMixin: - def do_bench( - self, - fn, - *input_tensors: torch.Tensor, - output_tensor: Optional[torch.Tensor] = None, - ) -> float: - return benchmarker.benchmark_cpu(fn) - - -class TritonBenchmarkRequest(BenchmarkRequest): +class TritonBenchmarkRequest(GPUDeviceBenchmarkRequest): # Important: Instances of this class have to be serializable # across process boundaries. Do not put CUDA Tensors in here! def __init__( @@ -656,22 +646,28 @@ class TritonBenchmarkRequest(BenchmarkRequest): if "warmup" in inspect.signature(run_method).parameters: warmup_arg["warmup"] = False - if output_tensor.device.type == "cpu": - stream = 0 + from torch._C import _cuda_getCurrentRawStream as get_raw_stream + + if torch.version.hip and self.matrix_instr_nonkdim != 0: + return functools.partial( + run_method, + *input_tensors, + output_tensor, + *self.extra_args, + grid=self.grid, + **warmup_arg, + stream=get_raw_stream(self.output_tensor_meta.device.index), + ) else: - from torch._C import _cuda_getCurrentRawStream as get_raw_stream - - stream = get_raw_stream(self.output_tensor_meta.device.index) - - return functools.partial( - run_method, - *input_tensors, - output_tensor, - *self.extra_args, - grid=self.grid, - **warmup_arg, - stream=stream, - ) + return functools.partial( + run_method, + *input_tensors, + output_tensor, + *self.extra_args, + grid=self.grid, + **warmup_arg, + stream=get_raw_stream(self.output_tensor_meta.device.index), + ) def precompile(self): mod = PyCodeCache.load_by_key_path(self.module_cache_key, self.module_path) @@ -681,15 +677,7 @@ class TritonBenchmarkRequest(BenchmarkRequest): return f"{self.kernel_name=}, {self.module_path=}, {self.module_cache_key=}" -class TritonGPUBenchmarkRequest(GPUDeviceBenchmarkMixin, TritonBenchmarkRequest): - pass - - -class TritonCPUBenchmarkRequest(CPUDeviceBenchmarkMixin, TritonBenchmarkRequest): - pass - - -class CUDABenchmarkRequest(GPUDeviceBenchmarkMixin, BenchmarkRequest): +class CUDABenchmarkRequest(GPUDeviceBenchmarkRequest): # Important: Instances of this class have to be serializable # across process boundaries. Do not put CUDA Tensors in here! @@ -806,7 +794,17 @@ class CUDABenchmarkRequest(GPUDeviceBenchmarkMixin, BenchmarkRequest): return f"{self.kernel_name=}, {self.source_file=}, {self.hash_key=}" -class CppBenchmarkRequest(CPUDeviceBenchmarkMixin, BenchmarkRequest): +class CPUDeviceBenchmarkRequest(BenchmarkRequest): + def do_bench( + self, + fn, + *input_tensors: torch.Tensor, + output_tensor: Optional[torch.Tensor] = None, + ) -> float: + return benchmarker.benchmark_cpu(fn) + + +class CppBenchmarkRequest(CPUDeviceBenchmarkRequest): # Important: Instances of this class have to be serializable # across process boundaries. Do not put Tensors in here! 
diff --git a/torch/_inductor/codegen/common.py b/torch/_inductor/codegen/common.py index bd465abe1ecb..045e7d45480b 100644 --- a/torch/_inductor/codegen/common.py +++ b/torch/_inductor/codegen/common.py @@ -240,11 +240,7 @@ def init_backend_registration(): from .wrapper import PythonWrapperCodegen if get_scheduling_for_device("cpu") is None: - cpu_backends = { - "cpp": CppScheduling, - "halide": HalideScheduling, - "triton": TritonScheduling, - } + cpu_backends = {"cpp": CppScheduling, "halide": HalideScheduling} register_backend_for_device( "cpu", lambda *args, **kwargs: cpu_backends[config.cpu_backend](*args, **kwargs), @@ -306,7 +302,6 @@ def get_device_op_overrides(device: str): assert isinstance(device, str) if not device_op_overrides_dict.keys(): - from . import cpu_device_op_overrides # noqa: F401 from .cuda import device_op_overrides # noqa: F401 from .xpu import device_op_overrides as xpu_op_overrides # noqa: F401 diff --git a/torch/_inductor/codegen/cpp.py b/torch/_inductor/codegen/cpp.py index 06eb08c6bec5..9c0863249797 100644 --- a/torch/_inductor/codegen/cpp.py +++ b/torch/_inductor/codegen/cpp.py @@ -4652,7 +4652,7 @@ class KernelGroup: def call_kernel(self, wrapper, kernel_name): _, call_args, arg_types = self.args.cpp_argdefs() wrapper.generate_kernel_call( - kernel_name, call_args, gpu=False, triton=False, arg_types=arg_types + kernel_name, call_args, gpu=False, arg_types=arg_types ) diff --git a/torch/_inductor/codegen/cpp_template_kernel.py b/torch/_inductor/codegen/cpp_template_kernel.py index 7ac125c76cc6..e72a895dc44d 100644 --- a/torch/_inductor/codegen/cpp_template_kernel.py +++ b/torch/_inductor/codegen/cpp_template_kernel.py @@ -107,9 +107,7 @@ class CppTemplateKernel(CppKernel): def call_kernel(self, name: str, node: ir.CppTemplateBuffer): wrapper = V.graph.wrapper_code _, call_args, arg_types = self.args.cpp_argdefs() - wrapper.generate_kernel_call( - name, call_args, triton=False, gpu=False, arg_types=arg_types - ) + wrapper.generate_kernel_call(name, call_args, gpu=False, arg_types=arg_types) def dtype(self, node: ir.Buffer) -> str: return DTYPE_TO_CPP[node.get_dtype()] diff --git a/torch/_inductor/codegen/cpu_device_op_overrides.py b/torch/_inductor/codegen/cpu_device_op_overrides.py deleted file mode 100644 index 1944c0e6beb8..000000000000 --- a/torch/_inductor/codegen/cpu_device_op_overrides.py +++ /dev/null @@ -1,26 +0,0 @@ -# mypy: allow-untyped-defs -from textwrap import dedent - -from .common import DeviceOpOverrides, register_device_op_overrides - - -class CpuDeviceOpOverrides(DeviceOpOverrides): - def import_get_raw_stream_as(self, name): - return dedent( - """ - def get_raw_stream(_): - return 0 - """ - ) - - def set_device(self, device_idx): - return "pass" - - def synchronize(self): - return "pass" - - def device_guard(self, device_idx): - return "pass" - - -register_device_op_overrides("cpu", CpuDeviceOpOverrides()) diff --git a/torch/_inductor/codegen/halide.py b/torch/_inductor/codegen/halide.py index bbf6866fdc38..337aa544b0d1 100644 --- a/torch/_inductor/codegen/halide.py +++ b/torch/_inductor/codegen/halide.py @@ -1639,7 +1639,6 @@ class HalideKernel(SIMDKernel): name, call_args, gpu=False, # grid/stream is handled internally in halide - triton=False, ) def generate_assert(self, check): diff --git a/torch/_inductor/codegen/rocm/rocm_benchmark_request.py b/torch/_inductor/codegen/rocm/rocm_benchmark_request.py index bdd68d9a2e6b..a70f45b7033d 100644 --- a/torch/_inductor/codegen/rocm/rocm_benchmark_request.py +++ 
b/torch/_inductor/codegen/rocm/rocm_benchmark_request.py @@ -7,18 +7,14 @@ from ctypes import byref, c_int, c_size_t, c_void_p from typing import Any, Callable, Iterable, List, Optional, Union import torch -from torch._inductor.autotune_process import ( - BenchmarkRequest, - GPUDeviceBenchmarkMixin, - TensorMeta, -) +from torch._inductor.autotune_process import GPUDeviceBenchmarkRequest, TensorMeta from torch._inductor.codecache import DLLWrapper, ROCmCodeCache log = logging.getLogger(__name__) -class ROCmBenchmarkRequest(GPUDeviceBenchmarkMixin, BenchmarkRequest): +class ROCmBenchmarkRequest(GPUDeviceBenchmarkRequest): # Important: Instances of this class have to be serializable # across process boundaries. Do not put CUDA Tensors in here! diff --git a/torch/_inductor/codegen/triton.py b/torch/_inductor/codegen/triton.py index 58773661477b..d2f20840c5d5 100644 --- a/torch/_inductor/codegen/triton.py +++ b/torch/_inductor/codegen/triton.py @@ -2689,11 +2689,6 @@ class TritonKernel(SIMDKernel): if name is None: code.splice(gen_common_triton_imports()) - device_type = V.graph.scheduler.get_current_device_or_throw().type - if device_type == "cpu": - code.splice("triton_helpers.set_driver_to_cpu()") - else: - code.splice("triton_helpers.set_driver_to_gpu()") if config.benchmark_kernel: code.splice(self.imports_for_benchmark_kernel()) @@ -2931,7 +2926,7 @@ class TritonKernel(SIMDKernel): call_args, grid, current_device.index, - gpu=current_device.type != "cpu", + gpu=True, triton=True, arg_types=arg_types, grid_fn=self._get_grid_fn_str(), diff --git a/torch/_inductor/codegen/wrapper.py b/torch/_inductor/codegen/wrapper.py index db61354294a2..62aee3bf19a1 100644 --- a/torch/_inductor/codegen/wrapper.py +++ b/torch/_inductor/codegen/wrapper.py @@ -1658,102 +1658,104 @@ class PythonWrapperCodegen(CodeGen): gpu: Defines whether the backend is GPU. Otherwise the backend is CPU. - triton: Defines whether the backend uses Triton for codegen. Otherwise it uses the CUDA language when gpu=True, - and C++ when gpu=False. + triton: Defines whether the GPU backend uses Triton for codegen. + Otherwise it uses the CUDA language for codegen. + Only valid when gpu == True. 
""" - if not (triton or gpu): - self.writeline(self.wrap_kernel_call(kernel_name, call_args)) - return - - device_index, call_args_str = self.prepare_triton_kernel_call( - device_index, call_args - ) - call_args_str = ", ".join(call_args_str) - stream_name = self.write_get_raw_stream(device_index, V.graph) - - if not triton: - stream_ptr = f"c_void_p({stream_name})" - self.writeline( - f"{kernel_name}.{kernel_name}({call_args_str}, {stream_ptr})" + if gpu: + device_index, call_args_str = self.prepare_triton_kernel_call( + device_index, call_args ) - return - - self.write_triton_header_once() - if grid is None: - grid_str = grid_fn - else: - grid_str = ", ".join(self._grid_dim_str(item) for item in grid) - if grid_extra_kwargs: - grid_str = f"{grid_str}, {grid_extra_kwargs}" - grid_str = f"{grid_fn}({grid_str})" - # add debug printer code for triton kernel calls at (jit) inductor level - debug_printer_manager = V.graph.wrapper_code.debug_printer - debug_printer_manager.set_printer_args(call_args, kernel_name, arg_types, None) - with debug_printer_manager: - self.writeline( - f"{kernel_name}.run({call_args_str}, grid={grid_str}, stream={stream_name})" - ) - if ( - config.triton.autotune_at_compile_time - and kernel_name not in self.kernel_autotune_names - ): - # Create example args for autotune in a separate epilogue - assert arg_types is not None and len(call_args) == len( - arg_types - ), "call_args and arg_types do not match" - - tensor_args = {} - all_args = [] - if raw_args is None: - # create a dummy raw_args for uniform behavior in the following loop - raw_args = [None] * len(call_args) - else: - assert len(raw_args) == len( - call_args - ), "call_args and raw_args do not match" - - for i, (arg, arg_type, raw_arg) in enumerate( - zip(call_args, arg_types, raw_args) - ): - key = None - if isinstance(arg, str) and "=" in str(arg): - # arg may be passed in a kwarg style, and then we need to extract its value - key, arg = arg.split("=") - - if isinstance(arg_type, torch_dtype): - # workspace allocation is already generated by `generate_workspace_allocation()` - # in `TritonKernel.call_kernel()`. 
- if arg == "workspace": - arg_str = "workspace" - tensor_args[arg] = arg_str - elif arg not in tensor_args: - arg_str = self.generate_example_arg_value( - arg, arg_type, raw_arg, i - ) - tensor_args[arg] = arg_str - else: - arg_str = tensor_args[arg] + call_args_str = ", ".join(call_args_str) + stream_name = self.write_get_raw_stream(device_index, V.graph) + if triton: + self.write_triton_header_once() + if grid is None: + grid_str = grid_fn else: - arg_str = self.generate_example_arg_value(arg, arg_type, raw_arg, i) - all_args.append(arg_str if key is None else f"{key}={arg_str}") - - if grid is None: - grid_str = grid_fn - else: - grid_str = ", ".join( - self.generate_example_arg_value(g, type(g)) for g in grid + grid_str = ", ".join(self._grid_dim_str(item) for item in grid) + if grid_extra_kwargs: + grid_str = f"{grid_str}, {grid_extra_kwargs}" + grid_str = f"{grid_fn}({grid_str})" + # add debug printer code for triton kernel calls at (jit) inductor level + debug_printer_manager = V.graph.wrapper_code.debug_printer + debug_printer_manager.set_printer_args( + call_args, kernel_name, arg_types, None ) - if grid_extra_kwargs: - grid_str = f"{grid_str}, {grid_extra_kwargs}" - grid_str = f"{grid_fn}({grid_str})" + with debug_printer_manager: + self.writeline( + f"{kernel_name}.run({call_args_str}, grid={grid_str}, stream={stream_name})" + ) + if ( + config.triton.autotune_at_compile_time + and kernel_name not in self.kernel_autotune_names + ): + # Create example args for autotune in a separate epilogue + assert arg_types is not None and len(call_args) == len( + arg_types + ), "call_args and arg_types do not match" - self.kernel_autotune_calls.writeline( - f"{kernel_name}.run({', '.join(all_args)}, grid={grid_str}, stream={stream_name})" - ) - self.kernel_autotune_calls.writeline( - f"del {', '.join(arg for arg in tensor_args.values())}\n", - ) - self.kernel_autotune_names.add(kernel_name) + tensor_args = {} + all_args = [] + if raw_args is None: + # create a dummy raw_args for uniform behavior in the following loop + raw_args = [None] * len(call_args) + else: + assert len(raw_args) == len( + call_args + ), "call_args and raw_args do not match" + + for i, (arg, arg_type, raw_arg) in enumerate( + zip(call_args, arg_types, raw_args) + ): + key = None + if isinstance(arg, str) and "=" in str(arg): + # arg may be passed in a kwarg style, and then we need to extract its value + key, arg = arg.split("=") + + if isinstance(arg_type, torch_dtype): + # workspace allocation is already generated by `generate_workspace_allocation()` + # in `TritonKernel.call_kernel()`. 
+ if arg == "workspace": + arg_str = "workspace" + tensor_args[arg] = arg_str + elif arg not in tensor_args: + arg_str = self.generate_example_arg_value( + arg, arg_type, raw_arg, i + ) + tensor_args[arg] = arg_str + else: + arg_str = tensor_args[arg] + else: + arg_str = self.generate_example_arg_value( + arg, arg_type, raw_arg, i + ) + all_args.append(arg_str if key is None else f"{key}={arg_str}") + + if grid is None: + grid_str = grid_fn + else: + grid_str = ", ".join( + self.generate_example_arg_value(g, type(g)) for g in grid + ) + if grid_extra_kwargs: + grid_str = f"{grid_str}, {grid_extra_kwargs}" + grid_str = f"{grid_fn}({grid_str})" + + self.kernel_autotune_calls.writeline( + f"{kernel_name}.run({', '.join(all_args)}, grid={grid_str}, stream={stream_name})" + ) + self.kernel_autotune_calls.writeline( + f"del {', '.join(arg for arg in tensor_args.values())}\n", + ) + self.kernel_autotune_names.add(kernel_name) + else: + stream_ptr = f"c_void_p({stream_name})" + self.writeline( + f"{kernel_name}.{kernel_name}({call_args_str}, {stream_ptr})" + ) + else: + self.writeline(self.wrap_kernel_call(kernel_name, call_args)) def writeline(self, line): self.lines.append(line) diff --git a/torch/_inductor/config.py b/torch/_inductor/config.py index f3f596d06963..aaa8d9bf6b14 100644 --- a/torch/_inductor/config.py +++ b/torch/_inductor/config.py @@ -1151,7 +1151,7 @@ class rocm: use_preselected_instances: bool = False -# Backend to use for CPU codegen either "cpp" or "triton" (experimental) or "halide" (experimental) +# Backend to use for CPU codegen either "cpp" or "halide" (experimental) cpu_backend = "cpp" # Backend to use for CUDA codegen either "triton" or "halide" (experimental) diff --git a/torch/_inductor/runtime/benchmarking.py b/torch/_inductor/runtime/benchmarking.py index 8d521c0d714f..44c222c3f936 100644 --- a/torch/_inductor/runtime/benchmarking.py +++ b/torch/_inductor/runtime/benchmarking.py @@ -77,7 +77,7 @@ class Benchmarker: def benchmark( self: Self, fn: Callable[..., Any], - fn_args: Tuple[Any, ...], + fn_args: Tuple[Any], fn_kwargs: Dict[str, Any], **kwargs: Any, ) -> float: diff --git a/torch/_inductor/runtime/triton_helpers.py b/torch/_inductor/runtime/triton_helpers.py index 417f6734e432..1a8ff2ba2408 100644 --- a/torch/_inductor/runtime/triton_helpers.py +++ b/torch/_inductor/runtime/triton_helpers.py @@ -1,7 +1,5 @@ # mypy: allow-untyped-decorators # mypy: allow-untyped-defs -import warnings - import triton import triton.language as tl @@ -33,32 +31,6 @@ except ImportError: raise NotImplementedError -def set_driver_to_cpu(): - driver = triton.runtime.driver - if backend := triton.backends.backends.get("cpu", None): - if isinstance(driver.active, backend.driver): - # Don't re-initialize backend if it is already active - return - driver.set_active(backend.driver()) - return - # This can be a hard error once triton-cpu is merged into fbcode - warnings.warn( - "Could not find an active CPU backend. Generated kernels will not be executable!" 
- ) - - -def set_driver_to_gpu(): - driver = triton.runtime.driver - for name, backend in triton.backends.backends.items(): - if backend.driver.is_active() and name != "cpu": - if isinstance(driver.active, backend.driver): - # Don't re-initialize backend if it is already active - return - driver.set_active(backend.driver()) - return - raise RuntimeError("Could not find an active GPU backend") - - @triton.jit def promote_to_tensor(x): # Addition promotes to tensor for us diff --git a/torch/_inductor/runtime/triton_heuristics.py b/torch/_inductor/runtime/triton_heuristics.py index 6e4a628d71aa..4463d8ce3bb1 100644 --- a/torch/_inductor/runtime/triton_heuristics.py +++ b/torch/_inductor/runtime/triton_heuristics.py @@ -57,8 +57,6 @@ if triton is not None: from triton.runtime.autotuner import OutOfResources from triton.runtime.jit import KernelInterface - from . import triton_helpers - try: from triton.compiler.compiler import ASTSource except ImportError: @@ -69,14 +67,11 @@ if triton is not None: except ImportError: GPUTarget = None else: - from types import ModuleType - Config = object KernelInterface = object OutOfResources = object ASTSource = None GPUTarget = None - triton_helpers = ModuleType("triton_helpers") try: autograd_profiler = torch.autograd.profiler @@ -379,11 +374,6 @@ class CachingAutotuner(KernelInterface): compile_meta["device_type"] = self.device_props.type compile_meta["cc"] = self.device_props.cc - if self.device_props.type == "cpu": - triton_helpers.set_driver_to_cpu() - else: - triton_helpers.set_driver_to_gpu() - if ASTSource: compile_args = ( ASTSource( @@ -686,9 +676,6 @@ class CachingAutotuner(KernelInterface): return do_bench_using_profiling(kernel_call, warmup=10, rep=40) - if self.device_props.type == "cpu": - return benchmarker.benchmark_cpu(kernel_call) - return benchmarker.benchmark_gpu(kernel_call, rep=40) def clone_args(self, *args, **kwargs) -> Tuple[List[Any], Dict[str, Any]]: diff --git a/torch/_inductor/scheduler.py b/torch/_inductor/scheduler.py index 53229547dc21..c2a3604aea16 100644 --- a/torch/_inductor/scheduler.py +++ b/torch/_inductor/scheduler.py @@ -3473,11 +3473,12 @@ class Scheduler: self.current_device.type ): V.graph.wrapper_code.codegen_device_guard_exit() - self.current_device = device if device_need_guard(device.type): assert device.index is not None, "device should have an index" V.graph.wrapper_code.codegen_device_guard_enter(device.index) + self.current_device = device + self.buffer_names_to_free.update(node.last_usage) if node.is_template(): diff --git a/torch/_inductor/select_algorithm.py b/torch/_inductor/select_algorithm.py index 75e994f8120d..e0ffd5f4d1aa 100644 --- a/torch/_inductor/select_algorithm.py +++ b/torch/_inductor/select_algorithm.py @@ -15,7 +15,7 @@ import time from collections import namedtuple from concurrent.futures import as_completed, ThreadPoolExecutor from io import StringIO -from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union +from typing import Any, Callable, Dict, List, Optional, Tuple, Union from unittest.mock import patch import sympy @@ -27,12 +27,7 @@ from torch._dynamo.testing import rand_strided from torch._dynamo.utils import counters, identity, preserve_rng_state from . 
import config, ir -from .autotune_process import ( - TensorMeta, - TritonBenchmarkRequest, - TritonCPUBenchmarkRequest, - TritonGPUBenchmarkRequest, -) +from .autotune_process import TensorMeta, TritonBenchmarkRequest from .codecache import code_hash, PersistentCache, PyCodeCache from .codegen.common import IndentedBuffer, KernelTemplate from .codegen.triton import ( @@ -582,7 +577,6 @@ class TritonTemplateKernel(TritonKernel): grid_fn=f"{self.grid_fn.__module__}.{self.grid_fn.__name__}", arg_types=arg_types, triton_meta=self.triton_meta, - gpu="cpu" not in V.graph.device_types, ) @@ -752,12 +746,7 @@ class TritonTemplate(KernelTemplate): ), kwargs, ) - bmreq_cls: Type[TritonBenchmarkRequest] - if layout.device.type == "cpu": - bmreq_cls = TritonCPUBenchmarkRequest - else: - bmreq_cls = TritonGPUBenchmarkRequest - bmreq = bmreq_cls( + bmreq = TritonBenchmarkRequest( module_path=mod.__file__, module_cache_key=mod.key, kernel_name=kernel_name, diff --git a/torch/_inductor/utils.py b/torch/_inductor/utils.py index fdb7f1a9d173..fd3f0fa70074 100644 --- a/torch/_inductor/utils.py +++ b/torch/_inductor/utils.py @@ -1099,13 +1099,7 @@ def use_triton_template(layout, *, enable_int32=False, enable_float8=False): if enable_float8: layout_dtypes.extend([torch.float8_e4m3fn, torch.float8_e5m2]) return ( - ( - ( - layout.device.type == "cuda" - and _use_template_for_cuda(layout, layout_dtypes) - ) - or (layout.device.type == "cpu" and layout.dtype in layout_dtypes) - ) + _use_template_for_cuda(layout, layout_dtypes) and _use_autotune_backend("TRITON") and has_backend_feature(layout.device, BackendFeature.TRITON_TEMPLATES) ) diff --git a/torch/utils/_triton.py b/torch/utils/_triton.py index 3cc977c84838..92b42320c3c6 100644 --- a/torch/utils/_triton.py +++ b/torch/utils/_triton.py @@ -17,27 +17,15 @@ def has_triton_package() -> bool: @functools.lru_cache(None) def has_triton() -> bool: - if not has_triton_package(): - return False - from torch._dynamo.device_interface import get_interface_for_device def cuda_extra_check(device_interface): return device_interface.Worker.get_device_properties().major >= 7 - def cpu_extra_check(device_interface): - import triton.backends - - return "cpu" in triton.backends.backends - def _return_true(device_interface): return True - triton_supported_devices = { - "cuda": cuda_extra_check, - "xpu": _return_true, - "cpu": cpu_extra_check, - } + triton_supported_devices = {"cuda": cuda_extra_check, "xpu": _return_true} def is_device_compatible_with_triton(): for device, extra_check in triton_supported_devices.items(): @@ -46,7 +34,7 @@ def has_triton() -> bool: return True return False - return is_device_compatible_with_triton() + return is_device_compatible_with_triton() and has_triton_package() @functools.lru_cache(None)
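
Reader's note on the torch/utils/_triton.py hunk above: after the revert, has_triton() only recognizes CUDA (requiring compute capability >= 7) and XPU as Triton-capable devices, and the package availability check is folded into the final return instead of early-exiting. The sketch below is a condensed, self-contained approximation for illustration only; the device probing is parameterized (available_devices, cuda_major) rather than going through the real torch._dynamo.device_interface queries.

# Condensed approximation of the post-revert has_triton() control flow.
# Device probing is parameterized so the sketch runs standalone; the real
# code queries torch._dynamo.device_interface instead.
import functools


def _has_triton_package_sketch() -> bool:
    # Stand-in for torch.utils._triton.has_triton_package()
    try:
        import triton  # noqa: F401
        return True
    except ImportError:
        return False


@functools.lru_cache(None)
def has_triton_sketch(available_devices=("cuda",), cuda_major=8) -> bool:
    def cuda_extra_check(_):
        # Real check: device_interface.Worker.get_device_properties().major >= 7
        return cuda_major >= 7

    def _return_true(_):
        return True

    # After the revert, "cpu" is no longer listed here.
    triton_supported_devices = {"cuda": cuda_extra_check, "xpu": _return_true}

    def is_device_compatible_with_triton():
        for device, extra_check in triton_supported_devices.items():
            if device in available_devices and extra_check(None):
                return True
        return False

    # The package check is the last clause, matching the restored return statement.
    return is_device_compatible_with_triton() and _has_triton_package_sketch()


if __name__ == "__main__":
    print(has_triton_sketch())                       # depends on local triton install
    print(has_triton_sketch(available_devices=()))   # always False: no supported device
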