Revert "Optimize multi_tensor_apply (take 2) (#119764)"

This reverts commit 0b68a28c87df2c6eb2cf530be4659b5a2f8a95b0.

Reverted https://github.com/pytorch/pytorch/pull/119764 on behalf of https://github.com/huydhn due to Sorry for reverting your change but it is failing ROCm job in trunk 0b68a28c87.  Please help take a look and reland the change ([comment](https://github.com/pytorch/pytorch/pull/119764#issuecomment-2014190124))
Author: PyTorch MergeBot
Date:   2024-03-22 02:18:28 +00:00
Parent: 470b44c048
Commit: 5e0440edb4

3 changed files with 64 additions and 299 deletions

@@ -649,26 +649,14 @@ class TestForeach(TestCase):
     @onlyCUDA
     @ops(foreach_reduce_op_db, allowed_dtypes=floating_types())
-    @parametrize("use_cuda_graph", (False, True))
-    def test_big_num_tensors(self, device, dtype, op, use_cuda_graph):
+    def test_big_num_tensors(self, device, dtype, op):
         N = 600
         tensorlist = [make_tensor((2, 3), dtype=dtype, device=device, noncontiguous=False) for _ in range(N)]
         fn, ref_fn, *_ = self._get_funcs(op)
         import math
         for ord in (1, 2, math.inf):
-            if not use_cuda_graph:
-                actual = fn(inputs=[tensorlist], is_cuda=True, expect_fastpath=True, ord=ord, zero_size=False)
-            else:
-                # When using CUDA graphs and the tensor metadata doesn't fit in
-                # the static kernel argument space, multi_tensor_apply creates
-                # the launch arguments once, uses cudaUserObject_t to tie its
-                # lifetime to the graph, and reuses it throughout replays. This
-                # test verifies multi_tensor_apply's behavior in the scenario.
-                g = torch.cuda.CUDAGraph()
-                with torch.cuda.graph(g):
-                    actual = fn.func(tensorlist, ord=ord)
-                g.replay()
+            actual = fn(inputs=[tensorlist], is_cuda=True, expect_fastpath=True, ord=ord, zero_size=False)
             expect = ref_fn(inputs=[tensorlist], ord=ord)
             self.assertEqual(expect, actual, equal_nan=True)
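
For context, the removed parametrization exercised the scenario described in the deleted comment: when a foreach kernel's tensor metadata does not fit in the static kernel-argument space, multi_tensor_apply builds the launch arguments once and ties their lifetime to the captured graph so replays can reuse them. A minimal sketch of that capture-and-replay pattern, not taken from the PR and using torch._foreach_norm with illustrative tensor shapes, count, and warm-up stream, might look like this:

    # Hypothetical sketch: capture a foreach reduction in a CUDA graph and replay it.
    import torch

    device = "cuda"
    tensors = [torch.randn(2, 3, device=device) for _ in range(600)]

    # Warm up on a side stream so capture does not record one-time setup work.
    s = torch.cuda.Stream()
    s.wait_stream(torch.cuda.current_stream())
    with torch.cuda.stream(s):
        torch._foreach_norm(tensors, ord=2)
    torch.cuda.current_stream().wait_stream(s)

    g = torch.cuda.CUDAGraph()
    with torch.cuda.graph(g):
        norms = torch._foreach_norm(tensors, ord=2)

    g.replay()  # reuses whatever launch arguments were built during capture
    torch.cuda.synchronize()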