Revert "Optimize multi_tensor_apply (take 2) (#119764)"

This reverts commit fe41ba47652ca73569453bddb43605c77bb85184.

Reverted https://github.com/pytorch/pytorch/pull/119764 on behalf of https://github.com/atalman due to Failing internally ([comment](https://github.com/pytorch/pytorch/pull/119764#issuecomment-2024105399))
PyTorch MergeBot
2024-03-27 22:42:07 +00:00
parent 222dfc4282
commit bef01c7c2b
3 changed files with 66 additions and 330 deletions

@@ -649,26 +649,14 @@ class TestForeach(TestCase):
     @onlyCUDA
     @ops(foreach_reduce_op_db, allowed_dtypes=floating_types())
-    @parametrize("use_cuda_graph", (False, True))
-    def test_big_num_tensors(self, device, dtype, op, use_cuda_graph):
+    def test_big_num_tensors(self, device, dtype, op):
         N = 600
         tensorlist = [make_tensor((2, 3), dtype=dtype, device=device, noncontiguous=False) for _ in range(N)]
         fn, ref_fn, *_ = self._get_funcs(op)
         import math
         for ord in (1, 2, math.inf):
-            if not use_cuda_graph:
-                actual = fn(inputs=[tensorlist], is_cuda=True, expect_fastpath=True, ord=ord, zero_size=False)
-            else:
-                # When using CUDA graphs and the tensor metadata doesn't fit in
-                # the static kernel argument space, multi_tensor_apply creates
-                # the launch arguments once, uses cudaUserObject_t to tie its
-                # lifetime to the graph, and reuses it throughout replays. This
-                # test verifies multi_tensor_apply's behavior in the scenario.
-                g = torch.cuda.CUDAGraph()
-                with torch.cuda.graph(g):
-                    actual = fn.func(tensorlist, ord=ord)
-                g.replay()
+            actual = fn(inputs=[tensorlist], is_cuda=True, expect_fastpath=True, ord=ord, zero_size=False)
             expect = ref_fn(inputs=[tensorlist], ord=ord)
             self.assertEqual(expect, actual, equal_nan=True)
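
For context, the branch removed by this revert captured a foreach reduction inside a CUDA graph and replayed it, checking that multi_tensor_apply behaves correctly when the tensor metadata does not fit in the static kernel argument space. A minimal standalone sketch of that capture/replay pattern, assuming torch._foreach_norm as a concrete foreach reduce op (the test itself dispatches through its op database and test harness), might look like:

    import torch

    # Sketch only, not the test itself; requires a CUDA device.
    # torch._foreach_norm is assumed here as a concrete foreach reduce op.
    device = "cuda"
    tensorlist = [torch.randn(2, 3, device=device) for _ in range(600)]

    # Eager reference result computed outside the graph.
    expect = torch._foreach_norm(tensorlist, ord=2)

    g = torch.cuda.CUDAGraph()
    with torch.cuda.graph(g):
        # Outputs allocated during capture are static; replay fills them in place.
        actual = torch._foreach_norm(tensorlist, ord=2)
    g.replay()

    for a, e in zip(actual, expect):
        torch.testing.assert_close(a, e, equal_nan=True)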