diff --git a/aten/src/ATen/native/TensorFactories.cpp b/aten/src/ATen/native/TensorFactories.cpp
index 7bde58492201..bbd3672412bc 100644
--- a/aten/src/ATen/native/TensorFactories.cpp
+++ b/aten/src/ATen/native/TensorFactories.cpp
@@ -1322,7 +1322,6 @@ Tensor randn_like(
 
 // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ randperm ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 namespace {
-
 template <typename scalar_t>
 void randperm_cpu(Tensor& result, int64_t n, CPUGeneratorImpl* generator) {
   scalar_t* r__data = result.data_ptr<scalar_t>();
@@ -1330,18 +1329,22 @@ void randperm_cpu(Tensor& result, int64_t n, CPUGeneratorImpl* generator) {
   result.resize_({n});
   int64_t r__stride_0 = result.stride(0);
 
-  // we need to pick a number uniformly distributed between 0 and n
-  // when n is of the same order of magnitude as the biggest number returned by
-  // random the % result is not uniformly distributed
-  // so we use random64(), you'd run out of RAM before you
-  // start seeing the skew
-  // use no-initialization Fischer-Yates variant
-  // https://en.wikipedia.org/wiki/Fisher%E2%80%93Yates_shuffle#The_.22inside-out.22_algorithm
-  for (int64_t i = 0; i < n; i++) {
-    int64_t z = generator->random64() % (i + 1);
-    r__data[i * r__stride_0] = i;
-    r__data[i * r__stride_0] = r__data[z * r__stride_0];
-    r__data[z * r__stride_0] = i;
+  at::parallel_for(
+      0,
+      n,
+      internal::GRAIN_SIZE,
+      [&r__data, &r__stride_0](int64_t p_begin, int64_t p_end) {
+        for (const auto i : c10::irange(p_begin, p_end)) {
+          r__data[i * r__stride_0] = static_cast<scalar_t>(i);
+        }
+      });
+
+  for (int64_t i = 0; i < n - 1; i++) {
+    // NOLINTNEXTLINE(clang-analyzer-security.insecureAPI.rand)
+    int64_t z = generator->random() % (n - i);
+    scalar_t sav = r__data[i * r__stride_0];
+    r__data[i * r__stride_0] = r__data[(z + i) * r__stride_0];
+    r__data[(z + i) * r__stride_0] = sav;
   }
 }
 } // namespace
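
[Note] The hunk above swaps the inside-out, random64()-based Fischer-Yates variant for the classic two-phase form: fill the result with the identity permutation (parallelized via at::parallel_for), then sweep left to right, swapping each slot with a uniformly drawn slot at or after it. A minimal Python sketch of that algorithm, for reference only — the function name is mine, and random.Random.randrange stands in for CPUGeneratorImpl's random():

import random

def randperm_sketch(n, seed=0):
    # Phase 1: identity permutation (the C++ code fills this via at::parallel_for).
    r = list(range(n))
    # Phase 2: classic Fisher-Yates sweep; slot i is swapped with a uniformly
    # chosen slot in [i, n). randrange(n - i) plays the role of
    # generator->random() % (n - i) in the patched randperm_cpu.
    rng = random.Random(seed)
    for i in range(n - 1):
        z = rng.randrange(n - i)
        r[i], r[z + i] = r[z + i], r[i]
    return r

print(randperm_sketch(10))  # a permutation of 0..9

Unlike generator->random() % (n - i), randrange rejects out-of-range draws, so the sketch sidesteps the modulo skew the deleted comment describes; with a 32-bit random(), that skew only becomes measurable once n gets within a few orders of magnitude of 2**32.
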
diff --git a/test/test_dataloader.py b/test/test_dataloader.py
index 134fe3e81b7f..cf02325c14a6 100644
--- a/test/test_dataloader.py
+++ b/test/test_dataloader.py
@@ -246,7 +246,7 @@ class TestDatasetRandomSplit(TestCase):
                     range(10), [3, 7], generator=torch.Generator().manual_seed(1)
                 )
             ],
-            [[8, 4, 2], [0, 7, 5, 3, 6, 9, 1]],
+            [[5, 6, 1], [2, 0, 8, 9, 3, 7, 4]],
         )
         self.assertEqual(
             random_split(
diff --git a/test/test_sparse_csr.py b/test/test_sparse_csr.py
index e95c581e7fc5..d174587b887a 100644
--- a/test/test_sparse_csr.py
+++ b/test/test_sparse_csr.py
@@ -1956,7 +1956,7 @@ class TestSparseCSR(TestCase):
     @dtypesIfCUDA(*floating_and_complex_types_and(
                   *[torch.half] if SM53OrLater and TEST_CUSPARSE_GENERIC else [],
                   *[torch.bfloat16] if SM80OrLater and TEST_CUSPARSE_GENERIC else []))
-    @precisionOverride({torch.bfloat16: 3.5e-2, torch.float16: 1e-2})
+    @precisionOverride({torch.bfloat16: 1e-2, torch.float16: 1e-2})
     def test_sparse_addmm(self, device, dtype):
         def test_shape(m, n, p, nnz, broadcast, index_dtype, alpha_beta=None):
             if alpha_beta is None:
@@ -2617,7 +2617,7 @@ class TestSparseCSR(TestCase):
     @skipIfTorchDynamo()
     @onlyCPU
     @dtypes(torch.float32, torch.float64, torch.bfloat16, torch.float16)
-    @precisionOverride({torch.bfloat16: 0.02, torch.float16: 0.01})
+    @precisionOverride({torch.bfloat16: 0.01, torch.float16: 0.01})
     def test_sparse_mm_reduce(self, device, dtype):
         def run_test(m, n, k, nnz, reduce_type, index_dtype, train):
             csr = self.genSparseCSRTensor((m, n), nnz, dtype=dtype, device=device, index_dtype=index_dtype)
diff --git a/test/test_tensor_creation_ops.py b/test/test_tensor_creation_ops.py
index 5d46f6e1d618..88f87d816f14 100644
--- a/test/test_tensor_creation_ops.py
+++ b/test/test_tensor_creation_ops.py
@@ -3594,29 +3594,6 @@ class TestRandomTensorCreation(TestCase):
         self.assertEqual(non_contiguous_tensor, res)
         self.assertEqual(res.sort().values.long(), torch.arange(n, device=device))
 
-
-    @largeTensorTest("10GB", "cpu")
-    @largeTensorTest("40GB", "cuda")
-    @slowTest
-    def test_randperm_large(self, device):
-        # Test even distribution where rand32 might produce skewed "uniform" distribution
-        # n_items is chosen to not evenly divide 2**32 and be sufficiently large
-        # to easily detect skew
-        def decile(index, collection_size):
-            return index // (collection_size // 10)
-
-        n_items = 700_000_000
-        shuffled = torch.randperm(n_items, device=device)
-        interval = 1_000_000
-        shuffled_interval = shuffled[:interval]
-        # histogram implemented for float only
-        deciles = decile(shuffled_interval, shuffled.shape[0]).float().cpu()
-        hist, _ = deciles.histogram(10, range=(0, 10))
-        expected_bin = shuffled_interval.shape[0] / 10
-        expected_error = math.sqrt(expected_bin) / expected_bin * 3
-        error = (hist - expected_bin).abs().max() / expected_bin
-        self.assertTrue(error < expected_error, f"error {error} > {expected_error}")
-
     # Test exceptions when device and generator types are incompatible
     @onlyCUDA
     @unittest.skipIf(IS_FBCODE or IS_SANDCASTLE, "Produces inconsistent errors when run in fbcode.")
diff --git a/test/torch_np/test_random.py b/test/torch_np/test_random.py
index 925a60bd0d8d..af00e68cacaf 100644
--- a/test/torch_np/test_random.py
+++ b/test/torch_np/test_random.py
@@ -87,7 +87,7 @@ class TestShuffle(TestCase):
     @parametrize("use_numpy", [True, False])
    def test_2d(self, use_numpy):
         # np.shuffle only shuffles the first axis
-        ax = tnp.asarray([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
         ox = ax.copy()
 
         tnp.random.seed(1234)
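
[Note] The deleted test_randperm_large asserted that the head of a very large permutation spreads evenly across deciles of the index range — exactly the statistic a biased 32-bit modulo draw would skew, hence n_items = 700_000_000. Below is a scaled-down sketch of the same check, runnable on any machine; the function name and sizes are illustrative, and at this small n the modulo bias is far below the tolerance, so it only demonstrates the method:

import math
import torch

def check_decile_uniformity(n_items=1_000_000, interval=100_000, seed=0):
    g = torch.Generator().manual_seed(seed)
    shuffled = torch.randperm(n_items, generator=g)
    head = shuffled[:interval]
    # Bucket each index into one of 10 deciles; histogram() needs float input.
    deciles = (head // (n_items // 10)).float()
    hist, _ = deciles.histogram(10, range=(0, 10))
    expected_bin = interval / 10
    # ~3-sigma tolerance, treating per-bin counts as approximately Poisson.
    tolerance = 3 * math.sqrt(expected_bin) / expected_bin
    error = ((hist - expected_bin).abs().max() / expected_bin).item()
    return error < tolerance

assert check_decile_uniformity()
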