Fix #156261 _foreach_copy indexing (#156719)

Fixes #156261

Thanks to @ngimel's fast eyes

For testing, I had experimented with a broader test case change but found that creating a tensor of 2**31+1 size was too expensive to do more than just a few times. Note that while the test case does not run in CI, I did run it locally to ensure it passes with new changes and fails without.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/156719
Approved by: https://github.com/albanD
This commit is contained in:
Jane Xu
2025-06-24 09:51:08 -07:00
committed by PyTorch MergeBot
parent 310e8361c5
commit 4ee4863232
2 changed files with 12 additions and 3 deletions

View File

@@ -1357,6 +1357,9 @@ class TestForeach(TestCase):
def test_foreach_copy_with_multi_dtypes(self, device, dtype, op):
# check (a) multi_tensor_apply is called and (b) numerical parity with for-loop and Tensor.copy_
foreach_copy_ = ForeachFuncWrapper(op.inplace_variant)
tested_large_input = False
for sample in op.sample_inputs(
device, dtype, noncontiguous=False, allow_higher_dtype_scalars=True
):
@@ -1364,6 +1367,13 @@ class TestForeach(TestCase):
if src_dtype == dtype:
continue
self_tensors = [t.clone() for t in sample.input]
if not tested_large_input:
# see https://github.com/pytorch/pytorch/issues/156261
self_tensors.append(
torch.empty(2**31 + 1, device=device, dtype=dtype)
)
tested_large_input = True
src_tensors = [t.to(src_dtype) for t in self_tensors]
out = foreach_copy_(
(self_tensors, src_tensors), is_cuda=True, expect_fastpath=True