[1/N] Remove inclusion of ATen/core/Array.h (#122064)

The functionality of Array.h largely overlaps with std::array, so it should be safe to use std::array instead.
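As an illustrative aside (not part of the diff), the replacement pattern is mechanical: a std::array alias offers the same fixed-size storage and operator[] access as at::detail::Array, and value-initializing with {} zero-fills the elements, which is what the (0)/(nullptr) constructor calls did in the hunks below.

#include <array>
#include <cstdint>

// Sketch only: UINT4 mirrors one of the typedefs changed in this commit.
using UINT4 = std::array<uint32_t, 4>;

int main() {
  UINT4 counter{};  // value-initialized: all four elements are zero,
                    // equivalent to the old detail::UINT4(0)
  counter[2] = 7u;  // element access via operator[] is unchanged
  return static_cast<int>(counter[0] + counter[3]);  // still zero
}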

Pull Request resolved: https://github.com/pytorch/pytorch/pull/122064
Approved by: https://github.com/ezyang
Author: cyy
Committed by: PyTorch MergeBot
Date: 2024-11-18 08:50:28 +00:00
Commit: 06dde8c157
Parent: 6c6f745fa7
5 changed files with 22 additions and 20 deletions

View File

@@ -11,6 +11,7 @@
 #include <cuda.h>
 #endif
+#include <array>
-#include <ATen/core/Array.h>
 #include <c10/macros/Macros.h>
 #include <cmath>
@@ -21,10 +22,10 @@ namespace at {
 // typedefs for holding vector data
 namespace detail {
-typedef at::detail::Array<uint32_t, 4> UINT4;
-typedef at::detail::Array<uint32_t, 2> UINT2;
-typedef at::detail::Array<double, 2> DOUBLE2;
-typedef at::detail::Array<float, 2> FLOAT2;
+typedef std::array<uint32_t, 4> UINT4;
+typedef std::array<uint32_t, 2> UINT2;
+typedef std::array<double, 2> DOUBLE2;
+typedef std::array<float, 2> FLOAT2;
 } // namespace detail
@@ -79,7 +80,7 @@ public:
       uint64_t subsequence = 0) {
     key_[0] = static_cast<uint32_t>(seed);
     key_[1] = static_cast<uint32_t>(seed >> 32);
-    counter_ = detail::UINT4(0);
+    counter_ = detail::UINT4{};
     counter_[2] = static_cast<uint32_t>(subsequence);
     counter_[3] = static_cast<uint32_t>(subsequence >> 32);
     STATE = 0;

View File

@@ -296,7 +296,7 @@ void gpu_kernel_impl_nocast(TensorIteratorBase& iter, const func_t& f) {
   TORCH_INTERNAL_ASSERT(iter.noutputs() == 1);
   TORCH_INTERNAL_ASSERT(!needs_dynamic_casting<func_t>::check(iter));
-  at::detail::Array<char*, ntensors> data;
+  std::array<char*, ntensors> data;
   for (int i = 0; i < ntensors; i++) {
     data[i] = (char*)iter.data_ptr(i);
   }
@@ -313,7 +313,7 @@ void gpu_kernel_impl_nocast(TensorIteratorBase& iter, const func_t& f) {
   launch_legacy_kernel<128, unroll_factor>(numel, [=] GPU_LAMBDA(int idx) {
     auto offsets = offset_calc.get(idx);
     arg0_t* out = (arg0_t*)(data[0] + offsets[0]);
-    *out = invoke(f, &data.data[1], &offsets.data[1], 1);
+    *out = invoke(f, &data[1], &offsets[1], 1);
   });
 }
@@ -330,7 +330,7 @@ void gpu_kernel_impl(TensorIteratorBase& iter, const func_t& f) {
   TORCH_INTERNAL_ASSERT(iter.ninputs() == traits::arity);
   TORCH_INTERNAL_ASSERT(iter.noutputs() == 1);
-  at::detail::Array<char*, ntensors> data;
+  std::array<char*, ntensors> data;
   for (int i = 0; i < ntensors; i++) {
     data[i] = (char*)iter.data_ptr(i);
   }
@@ -341,16 +341,16 @@ void gpu_kernel_impl(TensorIteratorBase& iter, const func_t& f) {
   if (contiguous) {
 #ifdef USE_ROCM
-    at::detail::Array<ScalarType, ntensors> dtypes;
+    std::array<ScalarType, ntensors> dtypes;
     auto inner_strides = iter.get_inner_strides();
-    at::detail::Array<int, ntensors> strides;
+    std::array<int, ntensors> strides;
     for (int i = 0; i < ntensors; i++) {
       dtypes[i] = iter.dtype(i);
       strides[i] = inner_strides[i];
     }
     launch_legacy_kernel<512, 1>(numel, [=]GPU_LAMBDA(int idx) {
       void* out = data[0] + strides[0] * idx;
-      arg0_t result = invoke(f, &data.data[1], &strides.data[1], &dtypes.data[1], idx);
+      arg0_t result = invoke(f, &data[1], &strides[1], &dtypes[1], idx);
       c10::cast_and_store<arg0_t>(dtypes[0], out, result);
     });
 #else
@@ -368,7 +368,7 @@ void gpu_kernel_impl(TensorIteratorBase& iter, const func_t& f) {
         storer);
 #endif
   } else {
-    at::detail::Array<ScalarType, ntensors> dtypes;
+    std::array<ScalarType, ntensors> dtypes;
     for (int i = 0; i < ntensors; i++) {
       dtypes[i] = iter.dtype(i);
     }
@@ -376,7 +376,7 @@ void gpu_kernel_impl(TensorIteratorBase& iter, const func_t& f) {
     launch_legacy_kernel<128, 4>(numel, [=] GPU_LAMBDA(int idx) {
       auto offsets = offset_calc.get(idx);
       void* out = data[0] + offsets[0];
-      arg0_t result = invoke(f, &data.data[1], &offsets.data[1], &dtypes.data[1], 1);
+      arg0_t result = invoke(f, &data[1], &offsets[1], &dtypes[1], 1);
       c10::cast_and_store<arg0_t>(dtypes[0], out, result);
     });
   }
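One detail in the hunks above, noted here as an aside rather than as part of the diff: the old at::detail::Array exposes its storage through a public data member, so these kernels passed the tail of the argument array as &data.data[1]; with std::array the equivalent expression is &data[1], which points at the same second element because std::array guarantees contiguous storage. A minimal standalone sketch:

#include <array>
#include <cstdio>

int main() {
  std::array<int, 4> data{10, 20, 30, 40};
  // Pointer to everything after element 0, mirroring invoke(f, &data[1], ...).
  int* tail = &data[1];
  for (int i = 0; i < 3; ++i) {
    std::printf("%d ", tail[i]);  // prints: 20 30 40
  }
  return 0;
}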

View File

@@ -2,6 +2,7 @@
 #include <ATen/native/cuda/IndexKernel.h>
 #include <ATen/native/IndexKernel.h>
+#include <array>
 #include <type_traits>
 #include <ATen/core/TensorBase.h>
 #include <ATen/Dispatch.h>
@@ -68,9 +69,9 @@ void gpu_index_kernel(TensorIteratorBase& iter, const IntArrayRef index_size, co
     return;
   }
-  auto sizes = at::detail::Array<int64_t, MAX_DIMS>(0);
-  auto strides = at::detail::Array<int64_t, MAX_DIMS>(0);
-  auto index_ptrs = at::detail::Array<char*, MAX_DIMS>(nullptr);
+  auto sizes = std::array<int64_t, MAX_DIMS>{};
+  auto strides = std::array<int64_t, MAX_DIMS>{};
+  auto index_ptrs = std::array<char*, MAX_DIMS>{};
   for (unsigned i = 0; i < num_indices; i++) {
     sizes[i] = index_size[i];
     strides[i] = index_stride[i];

View File

@@ -7,8 +7,8 @@
 #include <ATen/detail/CUDAHooksInterface.h>
 #include <ATen/native/SpectralOpsUtils.h>
+#include <array>
 #include <cmath>
 #include <vector>
 namespace at::native {
@@ -17,7 +17,7 @@ namespace at::native {
 // In mirrored dims, maps linear index i to (n - i) % n
 template <typename index_t>
 struct HermitianSymmetryOffsetCalculator {
-  using offset_type = at::detail::Array<index_t, 1>;
+  using offset_type = std::array<index_t, 1>;
   using dim_type = std::remove_cv_t<decltype(MAX_DIMS)>;
   dim_type dims;
   at::cuda::detail::IntDivider<index_t> sizes_[MAX_DIMS];

View File

@@ -1,9 +1,9 @@
+#include <array>
 #include <gtest/gtest.h>
 #include <ATen/ATen.h>
 #include <ATen/native/cuda/Loops.cuh>
 #include <ATen/native/cuda/MemoryAccess.cuh>
 #include <ATen/cuda/CUDAContext.h>
-#include <ATen/core/Array.h>
 using namespace at::native;
 using namespace at::native::memory;
@@ -77,7 +77,7 @@ TEST(TestVectorizedMemoryAccess, CanVectorizeUpTo) {
 template <typename scalar_t, int vec_size>
 __global__ void vectorized_copy(scalar_t *dst, scalar_t *src) {
   static_assert(vec_size <= thread_work_size() && thread_work_size() % vec_size == 0, "Invalid vec_size");
-  using array_t = at::detail::Array<char*, 2>;
+  using array_t = std::array<char*, 2>;
   array_t data;
   data[0] = reinterpret_cast<char *>(dst);
   data[1] = reinterpret_cast<char *>(src);
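A closing aside on the test kernel above, stated as an assumption rather than anything the diff spells out: indexing a std::array inside a __global__ function works because std::array's operator[] is constexpr and nvcc is run with --expt-relaxed-constexpr (which the PyTorch CUDA build passes); the old at::detail::Array obtained the same property by marking its accessors C10_HOST_DEVICE. A minimal sketch of the pattern:

#include <array>
#include <cstdio>

// Sketch only; assumes nvcc with --expt-relaxed-constexpr.
__global__ void demo_kernel() {
  std::array<int, 2> vals{};  // value-initialized to {0, 0}
  vals[0] = 1;
  vals[1] = 2;
  printf("%d\n", vals[0] + vals[1]);  // prints: 3
}

int main() {
  demo_kernel<<<1, 1>>>();
  cudaDeviceSynchronize();
  return 0;
}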