[1/N] Remove inclusion of ATen/core/Array.h (#122064)

The functionality of Array.h largely overlaps with std::array, so it should be safe to use std::array instead.
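As an illustrative aside (not part of the diff), the replacement pattern is mechanical: a std::array alias offers the same fixed-size storage and operator[] access as at::detail::Array, and value-initializing with {} zero-fills the elements, which is what the (0)/(nullptr) constructor calls did in the hunks below.

#include <array>
#include <cstdint>

// Sketch only: UINT4 mirrors one of the typedefs changed in this commit.
using UINT4 = std::array<uint32_t, 4>;

int main() {
  UINT4 counter{};  // value-initialized: all four elements are zero,
                    // equivalent to the old detail::UINT4(0)
  counter[2] = 7u;  // element access via operator[] is unchanged
  return static_cast<int>(counter[0] + counter[3]);  // still zero
}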

Pull Request resolved: https://github.com/pytorch/pytorch/pull/122064
Approved by: https://github.com/ezyang
Author: cyy
Committed by: PyTorch MergeBot
Date: 2024-11-18 08:50:28 +00:00
Commit: 06dde8c157
Parent: 6c6f745fa7
5 changed files with 22 additions and 20 deletions

View File

@@ -11,6 +11,7 @@
 #include <cuda.h>
 #endif
+#include <array>
-#include <ATen/core/Array.h>
 #include <c10/macros/Macros.h>
 #include <cmath>
@@ -21,10 +22,10 @@ namespace at {
 // typedefs for holding vector data
 namespace detail {
-typedef at::detail::Array<uint32_t, 4> UINT4;
-typedef at::detail::Array<uint32_t, 2> UINT2;
-typedef at::detail::Array<double, 2> DOUBLE2;
-typedef at::detail::Array<float, 2> FLOAT2;
+typedef std::array<uint32_t, 4> UINT4;
+typedef std::array<uint32_t, 2> UINT2;
+typedef std::array<double, 2> DOUBLE2;
+typedef std::array<float, 2> FLOAT2;
 } // namespace detail
@@ -79,7 +80,7 @@ public:
       uint64_t subsequence = 0) {
     key_[0] = static_cast<uint32_t>(seed);
     key_[1] = static_cast<uint32_t>(seed >> 32);
-    counter_ = detail::UINT4(0);
+    counter_ = detail::UINT4{};
     counter_[2] = static_cast<uint32_t>(subsequence);
     counter_[3] = static_cast<uint32_t>(subsequence >> 32);
     STATE = 0;

View File

@@ -296,7 +296,7 @@ void gpu_kernel_impl_nocast(TensorIteratorBase& iter, const func_t& f) {
   TORCH_INTERNAL_ASSERT(iter.noutputs() == 1);
   TORCH_INTERNAL_ASSERT(!needs_dynamic_casting<func_t>::check(iter));
-  at::detail::Array<char*, ntensors> data;
+  std::array<char*, ntensors> data;
   for (int i = 0; i < ntensors; i++) {
     data[i] = (char*)iter.data_ptr(i);
   }
@@ -313,7 +313,7 @@ void gpu_kernel_impl_nocast(TensorIteratorBase& iter, const func_t& f) {
   launch_legacy_kernel<128, unroll_factor>(numel, [=] GPU_LAMBDA(int idx) {
     auto offsets = offset_calc.get(idx);
     arg0_t* out = (arg0_t*)(data[0] + offsets[0]);
-    *out = invoke(f, &data.data[1], &offsets.data[1], 1);
+    *out = invoke(f, &data[1], &offsets[1], 1);
   });
 }
@@ -330,7 +330,7 @@ void gpu_kernel_impl(TensorIteratorBase& iter, const func_t& f) {
   TORCH_INTERNAL_ASSERT(iter.ninputs() == traits::arity);
   TORCH_INTERNAL_ASSERT(iter.noutputs() == 1);
-  at::detail::Array<char*, ntensors> data;
+  std::array<char*, ntensors> data;
   for (int i = 0; i < ntensors; i++) {
     data[i] = (char*)iter.data_ptr(i);
   }
@@ -341,16 +341,16 @@ void gpu_kernel_impl(TensorIteratorBase& iter, const func_t& f) {
   if (contiguous) {
 #ifdef USE_ROCM
-    at::detail::Array<ScalarType, ntensors> dtypes;
+    std::array<ScalarType, ntensors> dtypes;
     auto inner_strides = iter.get_inner_strides();
-    at::detail::Array<int, ntensors> strides;
+    std::array<int, ntensors> strides;
     for (int i = 0; i < ntensors; i++) {
       dtypes[i] = iter.dtype(i);
       strides[i] = inner_strides[i];
     }
     launch_legacy_kernel<512, 1>(numel, [=]GPU_LAMBDA(int idx) {
       void* out = data[0] + strides[0] * idx;
-      arg0_t result = invoke(f, &data.data[1], &strides.data[1], &dtypes.data[1], idx);
+      arg0_t result = invoke(f, &data[1], &strides[1], &dtypes[1], idx);
       c10::cast_and_store<arg0_t>(dtypes[0], out, result);
     });
 #else
@@ -368,7 +368,7 @@ void gpu_kernel_impl(TensorIteratorBase& iter, const func_t& f) {
         storer);
 #endif
   } else {
-    at::detail::Array<ScalarType, ntensors> dtypes;
+    std::array<ScalarType, ntensors> dtypes;
     for (int i = 0; i < ntensors; i++) {
       dtypes[i] = iter.dtype(i);
     }
@@ -376,7 +376,7 @@ void gpu_kernel_impl(TensorIteratorBase& iter, const func_t& f) {
     launch_legacy_kernel<128, 4>(numel, [=] GPU_LAMBDA(int idx) {
       auto offsets = offset_calc.get(idx);
       void* out = data[0] + offsets[0];
-      arg0_t result = invoke(f, &data.data[1], &offsets.data[1], &dtypes.data[1], 1);
+      arg0_t result = invoke(f, &data[1], &offsets[1], &dtypes[1], 1);
       c10::cast_and_store<arg0_t>(dtypes[0], out, result);
     });
   }
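One detail in the hunks above, noted here as an aside rather than as part of the diff: the old at::detail::Array exposes its storage through a public data member, so these kernels passed the tail of the argument array as &data.data[1]; with std::array the equivalent expression is &data[1], which points at the same second element because std::array guarantees contiguous storage. A minimal standalone sketch:

#include <array>
#include <cstdio>

int main() {
  std::array<int, 4> data{10, 20, 30, 40};
  // Pointer to everything after element 0, mirroring invoke(f, &data[1], ...).
  int* tail = &data[1];
  for (int i = 0; i < 3; ++i) {
    std::printf("%d ", tail[i]);  // prints: 20 30 40
  }
  return 0;
}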

View File

@@ -2,6 +2,7 @@
 #include <ATen/native/cuda/IndexKernel.h>
 #include <ATen/native/IndexKernel.h>
+#include <array>
 #include <type_traits>
 #include <ATen/core/TensorBase.h>
 #include <ATen/Dispatch.h>
@@ -68,9 +69,9 @@ void gpu_index_kernel(TensorIteratorBase& iter, const IntArrayRef index_size, co
     return;
   }
-  auto sizes = at::detail::Array<int64_t, MAX_DIMS>(0);
-  auto strides = at::detail::Array<int64_t, MAX_DIMS>(0);
-  auto index_ptrs = at::detail::Array<char*, MAX_DIMS>(nullptr);
+  auto sizes = std::array<int64_t, MAX_DIMS>{};
+  auto strides = std::array<int64_t, MAX_DIMS>{};
+  auto index_ptrs = std::array<char*, MAX_DIMS>{};
   for (unsigned i = 0; i < num_indices; i++) {
     sizes[i] = index_size[i];
     strides[i] = index_stride[i];

View File

@@ -7,8 +7,8 @@
 #include <ATen/detail/CUDAHooksInterface.h>
 #include <ATen/native/SpectralOpsUtils.h>
+#include <array>
 #include <cmath>
 #include <vector>
 namespace at::native {
@@ -17,7 +17,7 @@ namespace at::native {
 // In mirrored dims, maps linear index i to (n - i) % n
 template <typename index_t>
 struct HermitianSymmetryOffsetCalculator {
-  using offset_type = at::detail::Array<index_t, 1>;
+  using offset_type = std::array<index_t, 1>;
   using dim_type = std::remove_cv_t<decltype(MAX_DIMS)>;
   dim_type dims;
   at::cuda::detail::IntDivider<index_t> sizes_[MAX_DIMS];

View File

@@ -1,9 +1,9 @@
+#include <array>
 #include <gtest/gtest.h>
 #include <ATen/ATen.h>
 #include <ATen/native/cuda/Loops.cuh>
 #include <ATen/native/cuda/MemoryAccess.cuh>
 #include <ATen/cuda/CUDAContext.h>
-#include <ATen/core/Array.h>
 using namespace at::native;
 using namespace at::native::memory;
@@ -77,7 +77,7 @@ TEST(TestVectorizedMemoryAccess, CanVectorizeUpTo) {
 template <typename scalar_t, int vec_size>
 __global__ void vectorized_copy(scalar_t *dst, scalar_t *src) {
   static_assert(vec_size <= thread_work_size() && thread_work_size() % vec_size == 0, "Invalid vec_size");
-  using array_t = at::detail::Array<char*, 2>;
+  using array_t = std::array<char*, 2>;
   array_t data;
   data[0] = reinterpret_cast<char *>(dst);
   data[1] = reinterpret_cast<char *>(src);
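A closing aside on the test kernel above, stated as an assumption rather than anything the diff spells out: indexing a std::array inside a __global__ function works because std::array's operator[] is constexpr and nvcc is run with --expt-relaxed-constexpr (which the PyTorch CUDA build passes); the old at::detail::Array obtained the same property by marking its accessors C10_HOST_DEVICE. A minimal sketch of the pattern:

#include <array>
#include <cstdio>

// Sketch only; assumes nvcc with --expt-relaxed-constexpr.
__global__ void demo_kernel() {
  std::array<int, 2> vals{};  // value-initialized to {0, 0}
  vals[0] = 1;
  vals[1] = 2;
  printf("%d\n", vals[0] + vals[1]);  // prints: 3
}

int main() {
  demo_kernel<<<1, 1>>>();
  cudaDeviceSynchronize();
  return 0;
}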