Fix triu_/tril_ overlap handling

2025-10-23 23:04:52 +08:00 · 2025-10-21 07:54:24 -07:00
4 changed files with 29 additions and 0 deletions
--- a/aten/src/ATen/native/TriangularOps.cpp
+++ b/aten/src/ATen/native/TriangularOps.cpp
@@ -141,6 +141,8 @@ void compute_triu_tril(const Tensor& self, int64_t k, const Tensor &result) {
    return;
  }

+  checkTrilTriuMemoryOverlap(result, self);
+
  bool inplace_op = self.is_same(result);

  bool inplace_update = false;
--- a/aten/src/ATen/native/TriangularOpsUtils.h
+++ b/aten/src/ATen/native/TriangularOpsUtils.h
@@ -1,3 +1,4 @@
+#include <ATen/MemoryOverlap.h>
 #include <ATen/core/Tensor.h>
 #include <ATen/native/LinearAlgebraUtils.h>

@@ -54,4 +55,13 @@ static inline std::tuple<bool, Tensor> checkTrilTriuBatchContiguous(const Tensor
  return std::make_tuple(true, tensor);
 }

+static inline void checkTrilTriuMemoryOverlap(const Tensor& result, const Tensor& self) {
+  // Validates `result` for internal overlap whether or not the call is in-place.
+  at::assert_no_internal_overlap(result);
+  // The pairwise overlap check applies only when `result` and `self` are distinct tensors.
+  if (!result.is_same(self)) {
+    at::assert_no_overlap(result, self);
+  }
+}
+
 }  // namespace at::native
--- a/aten/src/ATen/native/cuda/TriangularOps.cu
+++ b/aten/src/ATen/native/cuda/TriangularOps.cu
@@ -5,6 +5,7 @@
 #include <ATen/Dispatch.h>
 #include <ATen/MemoryOverlap.h>
 #include <ATen/native/Resize.h>
+#include <ATen/native/TriangularOpsUtils.h>

 #ifndef AT_PER_OPERATOR_HEADERS
 #include <ATen/Functions.h>
@@ -110,6 +111,8 @@ __global__ void triu_tril_kernel(

 template <bool upper>
 void triu_tril_cuda_template(const Tensor& result, const Tensor& self, int64_t k, const char* name) {
+  checkTrilTriuMemoryOverlap(result, self);
+
  AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4(
      at::ScalarType::ComplexHalf,
      at::ScalarType::Half,
--- a/test/test_linalg.py
+++ b/test/test_linalg.py
@@ -9986,6 +9986,20 @@ scipy_lobpcg  | {eq_err_scipy:10.2e}  | {eq_err_general_scipy:10.2e}  | {iters2:
        self.assertEqual(result_triu_min, expected_triu_min)
        self.assertEqual(result_tril_min, expected_tril_min)

+    @dtypes(torch.float)
+    def test_triu_tril_inplace_memory_overlap(self, device, dtype):
+        # expand() returns a view without copying, so all 9 elements alias one location.
+        expanded = torch.rand((), dtype=dtype, device=device).expand(3, 3)
+        msg = (  # used as a regex below, so "." and "()" are escaped to match literally
+            "unsupported operation: more than one element of the written-to tensor "
+            r"refers to a single memory location\. Please clone\(\) the tensor before "
+            r"performing the operation\."
+        )
+        with self.assertRaisesRegex(RuntimeError, msg):
+            expanded.triu_(1)
+        with self.assertRaisesRegex(RuntimeError, msg):
+            expanded.tril_(-1)
+
    @dtypes(torch.float, torch.double)
    @precisionOverride({torch.float32: 1e-4})
    def test_1_sized_with_0_strided(self, device, dtype):