diff --git a/aten/src/ATen/EmptyTensor.cpp b/aten/src/ATen/EmptyTensor.cpp
index 50005d2f5d83..5a72a09d1841 100644
--- a/aten/src/ATen/EmptyTensor.cpp
+++ b/aten/src/ATen/EmptyTensor.cpp
@@ -2,31 +2,93 @@
 #include
 #include
 #include
+#include
+
+#include
 
 namespace at {
 namespace detail {
-
-static c10::Allocator* GetCPUAllocatorMaybePinned(bool pin_memory) {
+namespace {
+c10::Allocator* GetCPUAllocatorMaybePinned(bool pin_memory) {
   if (pin_memory) {
     return at::detail::getCUDAHooks().getPinnedMemoryAllocator();
   }
   return c10::GetCPUAllocator();
 }
 
+constexpr uint64_t storage_max() {
+  // int64_t and size_t are used somewhat inconsistently throughout ATen.
+  // To be safe, storage size calculations must fit in both types.
+  constexpr auto int64_max = static_cast<uint64_t>(
+      std::numeric_limits<int64_t>::max());
+  constexpr auto size_max = static_cast<uint64_t>(
+      std::numeric_limits<size_t>::max());
+  return std::min(int64_max, size_max);
+}
+
+} // namespace (anonymous)
+
+size_t computeStorageNbytesContiguous(
+    IntArrayRef sizes,
+    size_t itemsize_bytes,
+    size_t storage_offset
+  ) {
+  // Ignore overflow checks on mobile
+#ifndef C10_MOBILE
+  uint64_t size = 1;
+  bool overflowed = c10::safe_multiplies_u64(sizes, &size);
+  overflowed |= c10::add_overflows(size, storage_offset, &size);
+  overflowed |= c10::mul_overflows(size, itemsize_bytes, &size);
+  overflowed |= size > storage_max();
+  TORCH_CHECK(!overflowed,
+              "Storage size calculation overflowed with sizes=", sizes);
+  return static_cast<size_t>(size);
+#else
+  const auto numel = c10::multiply_integers(sizes);
+  return itemsize_bytes * (storage_offset + numel);
+#endif
+}
+
 size_t computeStorageNbytes(
     IntArrayRef sizes,
     IntArrayRef strides,
-    size_t itemsize_bytes) {
+    size_t itemsize_bytes,
+    size_t storage_offset
+  ) {
+  // Ignore overflow checks on mobile
+#ifndef C10_MOBILE
   // size of the underlying storage is 1 bigger than the offset
   // of the last element according to stride
-  size_t size = 1;
+  uint64_t size = storage_offset + 1;
+  bool overflowed = false;
   for (const auto i : c10::irange(sizes.size())) {
-    if(sizes[i] == 0) {
+    if (sizes[i] == 0) {
       return 0;
     }
-    size += strides[i]*(sizes[i]-1);
+
+    uint64_t strided_size;
+    overflowed |= c10::mul_overflows(strides[i], sizes[i] - 1, &strided_size);
+    overflowed |= c10::add_overflows(size, strided_size, &size);
   }
-  return size * itemsize_bytes;
+  overflowed |= c10::mul_overflows(size, itemsize_bytes, &size);
+  overflowed |= size > storage_max();
+  TORCH_CHECK(!overflowed,
+              "Storage size calculation overflowed with sizes=",
+              sizes, " and strides=", strides);
+  return static_cast<size_t>(size);
+#else
+  // size of the underlying storage is 1 bigger than the offset
+  // of the last element according to stride
+  uint64_t size = 1;
+  for (const auto i : c10::irange(sizes.size())) {
+    if (sizes[i] == 0) {
+      return 0;
+    }
+
+    size += strides[i] * (sizes[i] - 1);
+  }
+  return itemsize_bytes * (storage_offset + size);
+#endif
 }
 
 TensorBase empty_generic(
@@ -37,9 +99,8 @@ TensorBase empty_generic(
     c10::optional<c10::MemoryFormat> memory_format_opt) {
   at::detail::check_size_nonnegative(size);
 
-  int64_t nelements = c10::multiply_integers(size);
   caffe2::TypeMeta dtype = scalarTypeToTypeMeta(scalar_type);
-  int64_t size_bytes = nelements * dtype.itemsize();
+  size_t size_bytes = computeStorageNbytesContiguous(size, dtype.itemsize());
   auto storage_impl = c10::make_intrusive<StorageImpl>(
       c10::StorageImpl::use_byte_size_t(),
       size_bytes,
@@ -73,7 +134,7 @@ TensorBase empty_strided_generic(
   at::detail::check_size_nonnegative(size);
   caffe2::TypeMeta dtype = scalarTypeToTypeMeta(scalar_type);
-  int64_t size_bytes = computeStorageNbytes(size, stride, dtype.itemsize());
+  size_t size_bytes = computeStorageNbytes(size, stride, dtype.itemsize());
   auto storage_impl = c10::make_intrusive<StorageImpl>(
       c10::StorageImpl::use_byte_size_t(),
       size_bytes,
diff --git a/aten/src/ATen/EmptyTensor.h b/aten/src/ATen/EmptyTensor.h
index a49b3e909d6e..895bcc8e1779 100644
--- a/aten/src/ATen/EmptyTensor.h
+++ b/aten/src/ATen/EmptyTensor.h
@@ -10,8 +10,11 @@ inline void check_size_nonnegative(IntArrayRef size) {
   }
 }
 
+TORCH_API size_t computeStorageNbytesContiguous(
+    IntArrayRef sizes, size_t itemsize, size_t storage_offset=0);
 TORCH_API size_t computeStorageNbytes(
-    IntArrayRef sizes, IntArrayRef strides, size_t itemsize);
+    IntArrayRef sizes, IntArrayRef strides,
+    size_t itemsize, size_t storage_offset=0);
 
 TORCH_API TensorBase empty_generic(
     IntArrayRef size,
diff --git a/aten/src/ATen/native/Resize.h b/aten/src/ATen/native/Resize.h
index 2d435900db8b..c6fe2b3d2146 100644
--- a/aten/src/ATen/native/Resize.h
+++ b/aten/src/ATen/native/Resize.h
@@ -2,6 +2,7 @@
 
 #include
 #include
+#include
 #include
 #include
 
@@ -30,22 +31,16 @@ TORCH_API bool resize_output_check(const Tensor& output, IntArrayRef shape);
 
 TORCH_API void resize_bytes_cpu(StorageImpl* storage, size_t size_bytes);
 
-static inline void maybe_resize_storage_cpu(TensorImpl* self, uint64_t new_size) {
+static inline void maybe_resize_storage_cpu(TensorImpl* self, size_t new_size_bytes) {
   // It does not make sense to try to resize a storage
   // to hold 0 elements, and this can break
   // if storage_offset is positive but
   // new_size is 0, so just bail in that case
   // (same comment is in cuda/Resize.h)
-  if (new_size == 0) {
+  if (self->numel() == 0) {
     return;
   }
 
-  const auto new_size_bytes_i =
-      (new_size + self->storage_offset()) * self->dtype().itemsize();
-  TORCH_CHECK(!overflows<size_t>(new_size_bytes_i), "Requested storage size (",
-      new_size_bytes_i, ") cannot be represented as a size_t");
-  const auto new_size_bytes = static_cast<size_t>(new_size_bytes_i);
-
   const Storage& storage = self->unsafe_storage();
   if (!storage) {
     auto new_storage = c10::make_intrusive<StorageImpl>(
@@ -68,15 +63,19 @@ inline TensorImpl* resize_impl_cpu_(
     return self;
   }
 
-  int64_t storage_size = 1;
+  const auto itemsize = self->dtype().itemsize();
+  const auto storage_offset = self->storage_offset();
+  size_t storage_size = 1;
   if (stride) {
     self->set_sizes_and_strides(size, *stride);
-    // NB: storage size can be different from numel.
-    storage_size = storage_size_for(size, *stride);
+    storage_size = at::detail::computeStorageNbytes(
+        size, *stride, itemsize, storage_offset);
   } else {
     self->set_sizes_contiguous(size);
-    storage_size = self->numel();
+    storage_size = at::detail::computeStorageNbytesContiguous(
+        size, itemsize, storage_offset);
   }
+
   if (resize_storage) {
     maybe_resize_storage_cpu(self, storage_size);
   }
@@ -158,6 +157,12 @@ inline void setStrided(
     IntArrayRef stride,
     int64_t storage_offset) {
   TORCH_CHECK(size.size() == stride.size(), "mismatch in length of strides and shape");
+  for (auto val : stride) {
+    TORCH_CHECK(val >= 0,
+        "as_strided: Negative strides are not supported at the moment, "
+        "got strides: ", stride);
+  }
+
   auto* self_ = self.unsafeGetTensorImpl();
   checkInBoundsForStorage(
       size, stride, storage_offset, self_->dtype(), self_->storage());
@@ -170,11 +175,6 @@ inline void setStrided(
   if (self_->sizes() == size && self_->strides() == stride) {
     return;
   }
-  for (auto val : stride) {
-    TORCH_CHECK(val >= 0,
-        "as_strided: Negative strides are not supported at the moment, "
-        "got strides: ", stride);
-  }
   self_->set_sizes_and_strides(size, stride);
 }
diff --git a/aten/src/ATen/native/cuda/Resize.h b/aten/src/ATen/native/cuda/Resize.h
index acd860cc45b3..68d479307abf 100644
--- a/aten/src/ATen/native/cuda/Resize.h
+++ b/aten/src/ATen/native/cuda/Resize.h
@@ -1,6 +1,7 @@
 #pragma once
 
 #include
+#include
 #include
 #include
 
@@ -9,19 +10,15 @@ namespace at { namespace native {
 
 TORCH_CUDA_CPP_API void resize_bytes_cuda(StorageImpl* storage, size_t size_bytes);
 
-static inline void maybe_resize_storage_cuda(TensorImpl* self, uint64_t new_size) {
+static inline void maybe_resize_storage_cuda(TensorImpl* self, size_t new_size_bytes) {
   // It does not make sense to try to resize a storage
   // to hold 0 elements, and this can break
   // if storage_offset is positive but
   // new_size is 0, so just bail in that case
   // (same comment is in Resize.h)
-  if (new_size == 0) {
+  if (self->numel() == 0) {
     return;
   }
-  auto new_size_bytes_i = (new_size + self->storage_offset()) * self->dtype().itemsize();
-  TORCH_CHECK(!overflows<size_t>(new_size_bytes_i), "Requested storage size (",
-      new_size_bytes_i, ") cannot be represented as a size_t");
-  const auto new_size_bytes = static_cast<size_t>(new_size_bytes_i);
 
   const Storage &storage = self->unsafe_storage();
   TORCH_CHECK(storage, "Tensor: invalid null storage");
@@ -45,14 +42,17 @@ inline TensorImpl* resize_impl_cuda_(
     guard.set_index(self->storage().device().index());
   }
 
-  int64_t storage_size = 1;
+  const auto itemsize = self->dtype().itemsize();
+  const auto storage_offset = self->storage_offset();
+  size_t storage_size = 1;
   if (stride) {
     self->set_sizes_and_strides(size, *stride);
-    // NB: storage size can be different from numel.
-    storage_size = storage_size_for(size, *stride);
+    storage_size = at::detail::computeStorageNbytes(
+        size, *stride, itemsize, storage_offset);
   } else {
     self->set_sizes_contiguous(size);
-    storage_size = self->numel();
+    storage_size = at::detail::computeStorageNbytesContiguous(
+        size, itemsize, storage_offset);
   }
 
   maybe_resize_storage_cuda(self, storage_size);
diff --git a/c10/core/TensorImpl.h b/c10/core/TensorImpl.h
index 508665583186..b0c194bc20d1 100644
--- a/c10/core/TensorImpl.h
+++ b/c10/core/TensorImpl.h
@@ -16,9 +16,11 @@
 #include
 #include
 #include
+#include
 #include
 #include
+#include
 #include
 #include
@@ -2266,11 +2268,12 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target {
    * Compute the number of elements based on the sizes of a tensor.
    */
   int64_t compute_numel() const {
-    int64_t n = 1;
-    for (auto s : sizes()) {
-      n *= s;
-    }
-    return n;
+#if C10_HAS_BUILTIN_OVERFLOW() && !defined(C10_MOBILE)
+    // Use overflow checks if supported by the compiler
+    return safe_compute_numel();
+#else
+    return c10::multiply_integers(sizes());
+#endif
   }
 
   /**
@@ -2279,14 +2282,15 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target {
    * using a sparse layout has multiple dimensions with large sizes.
    */
   int64_t safe_compute_numel() const {
-    int64_t n = 1;
-    for (auto s : sizes()) {
-      TORCH_CHECK(
-          s == 0 || n <= std::numeric_limits<int64_t>::max() / s,
-          "numel: integer multiplication overflow");
-      n *= s;
-    }
-    return n;
+    uint64_t n = 1;
+    bool overflows = c10::safe_multiplies_u64(sizes(), &n);
+    constexpr auto numel_max = std::min(
+        static_cast<uint64_t>(std::numeric_limits<int64_t>::max()),
+        static_cast<uint64_t>(std::numeric_limits<size_t>::max()));
+
+    overflows |= (n > numel_max);
+    TORCH_CHECK(!overflows, "numel: integer multiplication overflow");
+    return static_cast<int64_t>(n);
   }
 
   /**
diff --git a/c10/util/safe_numerics.h b/c10/util/safe_numerics.h
new file mode 100644
index 000000000000..7eb9ed39395d
--- /dev/null
+++ b/c10/util/safe_numerics.h
@@ -0,0 +1,74 @@
+#pragma once
+#include
+#include
+
+#include
+#include
+#include
+
+// GCC has __builtin_mul_overflow from before it supported __has_builtin
+#ifdef _MSC_VER
+#define C10_HAS_BUILTIN_OVERFLOW() (0)
+#include
+#include
+#else
+#define C10_HAS_BUILTIN_OVERFLOW() (1)
+#endif
+
+namespace c10 {
+
+C10_ALWAYS_INLINE bool add_overflows(uint64_t a, uint64_t b, uint64_t* out) {
+#if C10_HAS_BUILTIN_OVERFLOW()
+  return __builtin_add_overflow(a, b, out);
+#else
+  unsigned long long tmp;
+  auto carry = _addcarry_u64(0, a, b, &tmp);
+  *out = tmp;
+  return carry;
+#endif
+}
+
+C10_ALWAYS_INLINE bool mul_overflows(uint64_t a, uint64_t b, uint64_t* out) {
+#if C10_HAS_BUILTIN_OVERFLOW()
+  return __builtin_mul_overflow(a, b, out);
+#else
+  *out = a * b;
+  // This test isnt exact, but avoids doing integer division
+  return (
+      (c10::llvm::countLeadingZeros(a) + c10::llvm::countLeadingZeros(b)) < 64);
+#endif
+}
+
+template <typename It>
+bool safe_multiplies_u64(It first, It last, uint64_t* out) {
+#if C10_HAS_BUILTIN_OVERFLOW()
+  uint64_t prod = 1;
+  bool overflow = false;
+  for (; first != last; ++first) {
+    overflow |= c10::mul_overflows(prod, *first, &prod);
+  }
+  *out = prod;
+  return overflow;
+#else
+  uint64_t prod = 1;
+  uint64_t prod_log2 = 0;
+  bool is_zero = false;
+  for (; first != last; ++first) {
+    auto x = static_cast<uint64_t>(*first);
+    prod *= x;
+    // log2(0) isn't valid, so need to track it specially
+    is_zero |= (x == 0);
+    prod_log2 += c10::llvm::Log2_64_Ceil(x);
+  }
+  *out = prod;
+  // This test isnt exact, but avoids doing integer division
+  return !is_zero && (prod_log2 >= 64);
+#endif
+}
+
+template <typename Container>
+bool safe_multiplies_u64(const Container& c, uint64_t* out) {
+  return safe_multiplies_u64(c.begin(), c.end(), out);
+}
+
+} // namespace c10
diff --git a/test/test_tensor_creation_ops.py b/test/test_tensor_creation_ops.py
index 29dfd12b7fdf..27a91c398b26 100644
--- a/test/test_tensor_creation_ops.py
+++ b/test/test_tensor_creation_ops.py
@@ -2665,6 +2665,15 @@ class TestTensorCreation(TestCase):
             y = torch.empty(tuple(size_ones_instead_of_zeros), device=device)
             self.assertEqual(x.stride(), y.stride())
 
+    @onlyNativeDeviceTypes
+    def test_empty_overflow(self, device):
+        with self.assertRaisesRegex(RuntimeError, 'Storage size calculation overflowed'):
+            torch.empty([2, 4, 2**29, 2**29], dtype=torch.float64)
+        with self.assertRaisesRegex(RuntimeError, 'Storage size calculation overflowed'):
+            torch.empty([8, 8, 2**29, 2**29], dtype=torch.float64)
+        with self.assertRaisesRegex(RuntimeError, 'Storage size calculation overflowed'):
+            torch.empty_strided([8, 8], [2**61, 1], dtype=torch.float64)
+
     def test_eye(self, device):
         for dtype in all_types_and_complex_and(torch.half, torch.bool, torch.bfloat16):
             if dtype == torch.bfloat16:
diff --git a/test/test_view_ops.py b/test/test_view_ops.py
index 71d273d6505b..064d001727ab 100644
--- a/test/test_view_ops.py
+++ b/test/test_view_ops.py
@@ -1795,6 +1795,14 @@ class TestOldViewOps(TestCase):
             x.resize_as_(y)
             self.assertEqual(y.shape, x.shape)
 
+    @onlyNativeDeviceTypes
+    def test_resize_overflow(self, device):
+        x = torch.empty((), dtype=torch.float64)
+        with self.assertRaisesRegex(RuntimeError, 'Storage size calculation overflowed'):
+            x.resize_([2, 4, 2**29, 2**29])
+        with self.assertRaisesRegex(RuntimeError, 'overflow'):
+            x.resize_([8, 8, 2**29, 2**29])
+
     def test_view_all_dtypes_and_devices(self, device):
         for dt in all_types_and_complex_and(torch.half, torch.bfloat16, torch.bool):
            x = torch.tensor([[1, 2], [3, 4], [5, 6]], dtype=dt, device=device)
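
Note (not part of the patch above): the sketch below is illustrative only. It shows how the helpers added in c10/util/safe_numerics.h are meant to compose, mirroring the overflow-checked path of computeStorageNbytesContiguous. The function name checked_nbytes and the use of std::overflow_error instead of TORCH_CHECK are assumptions made to keep the example self-contained, not APIs from the patch.

    #include <c10/util/safe_numerics.h>

    #include <cstdint>
    #include <stdexcept>
    #include <vector>

    // Hypothetical standalone helper: multiply all dimensions together, add the
    // storage offset, scale by the element size, and fail if any step overflows
    // uint64_t.
    uint64_t checked_nbytes(
        const std::vector<uint64_t>& sizes,
        uint64_t itemsize,
        uint64_t storage_offset = 0) {
      uint64_t n = 1;
      bool overflowed = c10::safe_multiplies_u64(sizes, &n);
      overflowed |= c10::add_overflows(n, storage_offset, &n);
      overflowed |= c10::mul_overflows(n, itemsize, &n);
      if (overflowed) {
        throw std::overflow_error("storage size calculation overflowed");
      }
      return n;
    }

    // Usage: checked_nbytes({8, 8, 1ull << 29, 1ull << 29}, /*itemsize=*/8) throws,
    // the same shape/dtype combination exercised by test_empty_overflow above.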