Fix #156261 _foreach_copy indexing (#156719)

Fixes #156261

Thanks to @ngimel's fast eyes

For testing, I had experimented with a broader test case change but found that creating a tensor of 2**31+1 size was too expensive to do more than just a few times. Note that while the test case does not run in CI, I did run it locally to ensure it passes with new changes and fails without.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/156719
Approved by: https://github.com/albanD
This commit is contained in:
Jane Xu
2025-06-24 09:51:08 -07:00
committed by PyTorch MergeBot
parent 310e8361c5
commit 4ee4863232
2 changed files with 12 additions and 3 deletions

View File

@@ -1357,6 +1357,9 @@ class TestForeach(TestCase):
def test_foreach_copy_with_multi_dtypes(self, device, dtype, op):
# check (a) multi_tensor_apply is called and (b) numerical parity with for-loop and Tensor.copy_
foreach_copy_ = ForeachFuncWrapper(op.inplace_variant)
tested_large_input = False
for sample in op.sample_inputs(
device, dtype, noncontiguous=False, allow_higher_dtype_scalars=True
):
@@ -1364,6 +1367,13 @@ class TestForeach(TestCase):
if src_dtype == dtype:
continue
self_tensors = [t.clone() for t in sample.input]
if not tested_large_input:
# see https://github.com/pytorch/pytorch/issues/156261
self_tensors.append(
torch.empty(2**31 + 1, device=device, dtype=dtype)
)
tested_large_input = True
src_tensors = [t.to(src_dtype) for t in self_tensors]
out = foreach_copy_(
(self_tensors, src_tensors), is_cuda=True, expect_fastpath=True