Revert "Optimize multi_tensor_apply (take 2) (#119764)"

This reverts commit fe41ba47652ca73569453bddb43605c77bb85184.

Reverted https://github.com/pytorch/pytorch/pull/119764 on behalf of https://github.com/atalman due to Failing internally ([comment](https://github.com/pytorch/pytorch/pull/119764#issuecomment-2024105399))
PyTorch MergeBot
2024-03-27 22:42:07 +00:00
parent 222dfc4282
commit bef01c7c2b
3 changed files with 66 additions and 330 deletions

@@ -649,26 +649,14 @@ class TestForeach(TestCase):
     @onlyCUDA
     @ops(foreach_reduce_op_db, allowed_dtypes=floating_types())
-    @parametrize("use_cuda_graph", (False, True))
-    def test_big_num_tensors(self, device, dtype, op, use_cuda_graph):
+    def test_big_num_tensors(self, device, dtype, op):
         N = 600
         tensorlist = [make_tensor((2, 3), dtype=dtype, device=device, noncontiguous=False) for _ in range(N)]
         fn, ref_fn, *_ = self._get_funcs(op)
         import math
         for ord in (1, 2, math.inf):
-            if not use_cuda_graph:
-                actual = fn(inputs=[tensorlist], is_cuda=True, expect_fastpath=True, ord=ord, zero_size=False)
-            else:
-                # When using CUDA graphs and the tensor metadata doesn't fit in
-                # the static kernel argument space, multi_tensor_apply creates
-                # the launch arguments once, uses cudaUserObject_t to tie its
-                # lifetime to the graph, and reuses it throughout replays. This
-                # test verifies multi_tensor_apply's behavior in the scenario.
-                g = torch.cuda.CUDAGraph()
-                with torch.cuda.graph(g):
-                    actual = fn.func(tensorlist, ord=ord)
-                g.replay()
+            actual = fn(inputs=[tensorlist], is_cuda=True, expect_fastpath=True, ord=ord, zero_size=False)
             expect = ref_fn(inputs=[tensorlist], ord=ord)
             self.assertEqual(expect, actual, equal_nan=True)
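
For context, the branch removed by this revert captured a foreach reduction inside a CUDA graph and replayed it, checking that multi_tensor_apply behaves correctly when the tensor metadata does not fit in the static kernel argument space. A minimal standalone sketch of that capture/replay pattern, assuming torch._foreach_norm as a concrete foreach reduce op (the test itself dispatches through its op database and test harness), might look like:

    import torch

    # Sketch only, not the test itself; requires a CUDA device.
    # torch._foreach_norm is assumed here as a concrete foreach reduce op.
    device = "cuda"
    tensorlist = [torch.randn(2, 3, device=device) for _ in range(600)]

    # Eager reference result computed outside the graph.
    expect = torch._foreach_norm(tensorlist, ord=2)

    g = torch.cuda.CUDAGraph()
    with torch.cuda.graph(g):
        # Outputs allocated during capture are static; replay fills them in place.
        actual = torch._foreach_norm(tensorlist, ord=2)
    g.replay()

    for a, e in zip(actual, expect):
        torch.testing.assert_close(a, e, equal_nan=True)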