Migrate _cat from TH to ATen (CUDA) (#33237)

Summary:
Fixes https://github.com/pytorch/pytorch/issues/24520

Benchmarks:

Upstream (TH implementation):

```
$ python -m pt.cat_test --tag_filter all --device cuda  --omp_num_threads 1 --mkl_num_threads 1
# ----------------------------------------
# PyTorch/Caffe2 Operator Micro-benchmarks
# ----------------------------------------
# Tag : all

# Benchmarking PyTorch: cat
# Mode: Eager
# Name: cat_sizes(1,1,1)_N2_dim0_cuda
# Input: sizes: (1, 1, 1), N: 2, dim: 0, device: cuda
Forward Execution Time (us) : 17.355

# Benchmarking PyTorch: cat
# Mode: Eager
# Name: cat_sizes(512,512,2)_N2_dim1_cuda
# Input: sizes: (512, 512, 2), N: 2, dim: 1, device: cuda
Forward Execution Time (us) : 30.718

# Benchmarking PyTorch: cat
# Mode: Eager
# Name: cat_sizes(128,1024,2)_N2_dim1_cuda
# Input: sizes: (128, 1024, 2), N: 2, dim: 1, device: cuda
Forward Execution Time (us) : 17.329

# Benchmarking PyTorch: cat
# Mode: Eager
# Name: cat_sizes(512,512,2)_N2_dim1_cuda
# Input: sizes: (512, 512, 2), N: 2, dim: 1, device: cuda
Forward Execution Time (us) : 30.176

# Benchmarking PyTorch: cat
# Mode: Eager
# Name: cat_sizes(1024,1024,2)_N2_dim0_cuda
# Input: sizes: (1024, 1024, 2), N: 2, dim: 0, device: cuda
Forward Execution Time (us) : 74.417

# Benchmarking PyTorch: cat
# Mode: Eager
# Name: cat_sizes(1025,1023,2)_N2_dim1_cuda
# Input: sizes: (1025, 1023, 2), N: 2, dim: 1, device: cuda
Forward Execution Time (us) : 75.728

# Benchmarking PyTorch: cat
# Mode: Eager
# Name: cat_sizes(1024,1024,2)_N2_dim2_cuda
# Input: sizes: (1024, 1024, 2), N: 2, dim: 2, device: cuda
Forward Execution Time (us) : 190.165

# Benchmarking PyTorch: cat
# Mode: Eager
# Name: cat_sizes[<function<lambda>at0x7fa8876fcf28>,111,65]_N5_dim0_cuda
# Input: sizes: [<function <lambda> at 0x7fa8876fcf28>, 111, 65], N: 5, dim: 0, device: cuda
Forward Execution Time (us) : 57.711

# Benchmarking PyTorch: cat
# Mode: Eager
# Name: cat_sizes[96,<function<lambda>at0x7fa886237048>,64]_N5_dim1_cuda
# Input: sizes: [96, <function <lambda> at 0x7fa886237048>, 64], N: 5, dim: 1, device: cuda
Forward Execution Time (us) : 49.903

# Benchmarking PyTorch: cat
# Mode: Eager
# Name: cat_sizes[128,64,<function<lambda>at0x7fa7b57bb840>]_N5_dim2_cuda
# Input: sizes: [128, 64, <function <lambda> at 0x7fa7b57bb840>], N: 5, dim: 2, device: cuda
Forward Execution Time (us) : 84.181

# Benchmarking PyTorch: cat
# Mode: Eager
# Name: cat_sizes[<function<lambda>at0x7fa7b57bba60>,32,64]_N50_dim0_cuda
# Input: sizes: [<function <lambda> at 0x7fa7b57bba60>, 32, 64], N: 50, dim: 0, device: cuda
Forward Execution Time (us) : 82.339

# Benchmarking PyTorch: cat
# Mode: Eager
# Name: cat_sizes[32,<function<lambda>at0x7fa7b57bbae8>,64]_N50_dim1_cuda
# Input: sizes: [32, <function <lambda> at 0x7fa7b57bbae8>, 64], N: 50, dim: 1, device: cuda
Forward Execution Time (us) : 82.312

# Benchmarking PyTorch: cat
# Mode: Eager
# Name: cat_sizes[33,65,<function<lambda>at0x7fa7b57bbb70>]_N50_dim2_cuda
# Input: sizes: [33, 65, <function <lambda> at 0x7fa7b57bbb70>], N: 50, dim: 2, device: cuda
Forward Execution Time (us) : 90.715

# Benchmarking PyTorch: cat
# Mode: Eager
# Name: cat_sizes(64,32,4,16,32)_N2_dim2_cuda
# Input: sizes: (64, 32, 4, 16, 32), N: 2, dim: 2, device: cuda
Forward Execution Time (us) : 129.021

# Benchmarking PyTorch: cat
# Mode: Eager
# Name: cat_sizes(16,32,4,16,32)_N8_dim2_cuda
# Input: sizes: (16, 32, 4, 16, 32), N: 8, dim: 2, device: cuda
Forward Execution Time (us) : 142.966

# Benchmarking PyTorch: cat
# Mode: Eager
# Name: cat_sizes(9,31,5,15,33)_N17_dim4_cuda
# Input: sizes: (9, 31, 5, 15, 33), N: 17, dim: 4, device: cuda
Forward Execution Time (us) : 387.023

# Benchmarking PyTorch: cat
# Mode: Eager
# Name: cat_sizes[<function<lambda>at0x7fa7b57bbbf8>]_N100_dim0_cuda
# Input: sizes: [<function <lambda> at 0x7fa7b57bbbf8>], N: 100, dim: 0, device: cuda
Forward Execution Time (us) : 36.647

# Benchmarking PyTorch: cat
# Mode: Eager
# Name: cat_sizes[<function<lambda>at0x7fa7b57bbc80>]_N1000_dim0_cuda
# Input: sizes: [<function <lambda> at 0x7fa7b57bbc80>], N: 1000, dim: 0, device: cuda
Forward Execution Time (us) : 278.890

# Benchmarking PyTorch: cat
# Mode: Eager
# Name: cat_sizes[<function<lambda>at0x7fa7b57bbd08>]_N2000_dim0_cuda
# Input: sizes: [<function <lambda> at 0x7fa7b57bbd08>], N: 2000, dim: 0, device: cuda
Forward Execution Time (us) : 557.752

# Benchmarking PyTorch: cat
# Mode: Eager
# Name: cat_sizes[<function<lambda>at0x7fa7b57bbd90>]_N3000_dim0_cuda
# Input: sizes: [<function <lambda> at 0x7fa7b57bbd90>], N: 3000, dim: 0, device: cuda
Forward Execution Time (us) : 842.512

```

New version (ATen implementation):

```
$ python -m pt.cat_test --tag_filter all --device cuda  --omp_num_threads 1 --mkl_num_threads 1
# ----------------------------------------
# PyTorch/Caffe2 Operator Micro-benchmarks
# ----------------------------------------
# Tag : all

# Benchmarking PyTorch: cat
# Mode: Eager
# Name: cat_sizes(1,1,1)_N2_dim0_cuda
# Input: sizes: (1, 1, 1), N: 2, dim: 0, device: cuda
Forward Execution Time (us) : 24.419

# Benchmarking PyTorch: cat
# Mode: Eager
# Name: cat_sizes(512,512,2)_N2_dim1_cuda
# Input: sizes: (512, 512, 2), N: 2, dim: 1, device: cuda
Forward Execution Time (us) : 25.025

# Benchmarking PyTorch: cat
# Mode: Eager
# Name: cat_sizes(128,1024,2)_N2_dim1_cuda
# Input: sizes: (128, 1024, 2), N: 2, dim: 1, device: cuda
Forward Execution Time (us) : 24.247

# Benchmarking PyTorch: cat
# Mode: Eager
# Name: cat_sizes(512,512,2)_N2_dim1_cuda
# Input: sizes: (512, 512, 2), N: 2, dim: 1, device: cuda
Forward Execution Time (us) : 25.098

# Benchmarking PyTorch: cat
# Mode: Eager
# Name: cat_sizes(1024,1024,2)_N2_dim0_cuda
# Input: sizes: (1024, 1024, 2), N: 2, dim: 0, device: cuda
Forward Execution Time (us) : 74.441

# Benchmarking PyTorch: cat
# Mode: Eager
# Name: cat_sizes(1025,1023,2)_N2_dim1_cuda
# Input: sizes: (1025, 1023, 2), N: 2, dim: 1, device: cuda
Forward Execution Time (us) : 74.866

# Benchmarking PyTorch: cat
# Mode: Eager
# Name: cat_sizes(1024,1024,2)_N2_dim2_cuda
# Input: sizes: (1024, 1024, 2), N: 2, dim: 2, device: cuda
Forward Execution Time (us) : 189.280

# Benchmarking PyTorch: cat
# Mode: Eager
# Name: cat_sizes[<function<lambda>at0x7f1c9b056048>,111,65]_N5_dim0_cuda
# Input: sizes: [<function <lambda> at 0x7f1c9b056048>, 111, 65], N: 5, dim: 0, device: cuda
Forward Execution Time (us) : 57.629

# Benchmarking PyTorch: cat
# Mode: Eager
# Name: cat_sizes[96,<function<lambda>at0x7f1c9b0560d0>,64]_N5_dim1_cuda
# Input: sizes: [96, <function <lambda> at 0x7f1c9b0560d0>, 64], N: 5, dim: 1, device: cuda
Forward Execution Time (us) : 49.975

# Benchmarking PyTorch: cat
# Mode: Eager
# Name: cat_sizes[128,64,<function<lambda>at0x7f1bce8f38c8>]_N5_dim2_cuda
# Input: sizes: [128, 64, <function <lambda> at 0x7f1bce8f38c8>], N: 5, dim: 2, device: cuda
Forward Execution Time (us) : 83.643

# Benchmarking PyTorch: cat
# Mode: Eager
# Name: cat_sizes[<function<lambda>at0x7f1bce8f3ae8>,32,64]_N50_dim0_cuda
# Input: sizes: [<function <lambda> at 0x7f1bce8f3ae8>, 32, 64], N: 50, dim: 0, device: cuda
Forward Execution Time (us) : 82.307

# Benchmarking PyTorch: cat
# Mode: Eager
# Name: cat_sizes[32,<function<lambda>at0x7f1bce8f3b70>,64]_N50_dim1_cuda
# Input: sizes: [32, <function <lambda> at 0x7f1bce8f3b70>, 64], N: 50, dim: 1, device: cuda
Forward Execution Time (us) : 82.323

# Benchmarking PyTorch: cat
# Mode: Eager
# Name: cat_sizes[33,65,<function<lambda>at0x7f1bce8f3bf8>]_N50_dim2_cuda
# Input: sizes: [33, 65, <function <lambda> at 0x7f1bce8f3bf8>], N: 50, dim: 2, device: cuda
Forward Execution Time (us) : 90.549

# Benchmarking PyTorch: cat
# Mode: Eager
# Name: cat_sizes(64,32,4,16,32)_N2_dim2_cuda
# Input: sizes: (64, 32, 4, 16, 32), N: 2, dim: 2, device: cuda
Forward Execution Time (us) : 129.022

# Benchmarking PyTorch: cat
# Mode: Eager
# Name: cat_sizes(16,32,4,16,32)_N8_dim2_cuda
# Input: sizes: (16, 32, 4, 16, 32), N: 8, dim: 2, device: cuda
Forward Execution Time (us) : 142.969

# Benchmarking PyTorch: cat
# Mode: Eager
# Name: cat_sizes(9,31,5,15,33)_N17_dim4_cuda
# Input: sizes: (9, 31, 5, 15, 33), N: 17, dim: 4, device: cuda
Forward Execution Time (us) : 386.973

# Benchmarking PyTorch: cat
# Mode: Eager
# Name: cat_sizes[<function<lambda>at0x7f1bce8f3c80>]_N100_dim0_cuda
# Input: sizes: [<function <lambda> at 0x7f1bce8f3c80>], N: 100, dim: 0, device: cuda
Forward Execution Time (us) : 43.800

# Benchmarking PyTorch: cat
# Mode: Eager
# Name: cat_sizes[<function<lambda>at0x7f1bce8f3d08>]_N1000_dim0_cuda
# Input: sizes: [<function <lambda> at 0x7f1bce8f3d08>], N: 1000, dim: 0, device: cuda
Forward Execution Time (us) : 279.023

# Benchmarking PyTorch: cat
# Mode: Eager
# Name: cat_sizes[<function<lambda>at0x7f1bce8f3d90>]_N2000_dim0_cuda
# Input: sizes: [<function <lambda> at 0x7f1bce8f3d90>], N: 2000, dim: 0, device: cuda
Forward Execution Time (us) : 565.790

# Benchmarking PyTorch: cat
# Mode: Eager
# Name: cat_sizes[<function<lambda>at0x7f1bce8f3e18>]_N3000_dim0_cuda
# Input: sizes: [<function <lambda> at 0x7f1bce8f3e18>], N: 3000, dim: 0, device: cuda
Forward Execution Time (us) : 845.153
```
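
(Not part of the PR: the numbers above come from the `operator_benchmark` harness invoked as shown. For a quick spot check outside that harness, a minimal CUDA-event timing sketch along the following lines can be used; the helper name, warm-up count, and iteration count are arbitrary choices.)

```
import torch

def time_cat_us(sizes, n_inputs, dim, iters=100):
    # Build n_inputs identically shaped CUDA tensors and time torch.cat over them.
    inputs = [torch.rand(*sizes, device="cuda") for _ in range(n_inputs)]
    for _ in range(10):  # warm-up
        torch.cat(inputs, dim=dim)
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    start.record()
    for _ in range(iters):
        torch.cat(inputs, dim=dim)
    end.record()
    torch.cuda.synchronize()
    # elapsed_time() reports milliseconds; return microseconds per call.
    return start.elapsed_time(end) / iters * 1000

print(time_cat_us((512, 512, 2), n_inputs=2, dim=1))
```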
Pull Request resolved: https://github.com/pytorch/pytorch/pull/33237

Differential Revision: D20069181

Pulled By: ngimel

fbshipit-source-id: b392e1ffd72c0d8df0c5a2d3ac96f59b37c84e32
Author: Wojciech Baranowski
Date: 2020-02-24 17:39:15 -08:00
Committed by: Facebook Github Bot
Commit: b10a39bb32 (parent 97da60d511)
8 changed files with 433 additions and 331 deletions

View File

@@ -1190,19 +1190,3 @@
- THTensor* self
- THTensor* src
]]
[[
name: _th_cat
cname: catArray
variants: [function]
backends:
- CUDA
cuda_bool: True
cuda_bfloat16: True
return: self
arguments:
- arg: THTensor* self
output: True
- TensorList tensors
- int64_t dim
]]

View File

@@ -74,6 +74,8 @@ bool canUse32BitIndexMath(const Tensor& t, int64_t max_elem) {
   int64_t elements = t.numel();
   if (elements >= max_elem) {
     return false;
+  } else if (elements == 0) {
+    return true;
   }
   int64_t offset = 0;
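
The added branch treats zero-element tensors as trivially 32-bit indexable, which matters for inputs that are empty but not covered by the legacy 1-D "skip" rule in the new cat path below. A small illustration of that case (hypothetical usage, not from the PR):

```
import torch

a = torch.randn(2, 3, device="cuda")
z = torch.empty(0, 3, device="cuda")  # zero elements, but 2-D, so it is not "skipped"
print(torch.cat([a, z], dim=0).shape)  # torch.Size([2, 3])
```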

View File

@@ -0,0 +1,360 @@
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/DeviceGuard.h>
#include <ATen/Context.h>
#include <ATen/Utils.h>
#include <ATen/MemoryOverlap.h>
#include <ATen/cuda/detail/IndexUtils.cuh>
#include <ATen/Dispatch.h>
#include <c10/core/TensorImpl.h>
#include <c10/core/ScalarType.h>
#include <c10/core/UndefinedTensorImpl.h>
#include <c10/cuda/CUDACachingAllocator.h>
#include <c10/util/typeid.h>
#include <c10/util/intrusive_ptr.h>
#include <THC/THC.h>
namespace at {
namespace native {
constexpr int CAT_ARRAY_BATCH_SIZE = 1024;
constexpr int CAT_ARRAY_MAX_INPUT_DIMS = 4;
namespace {
inline bool getCatGrid(ptrdiff_t nTensors, dim3& grid) {
const int numSM = at::cuda::getCurrentDeviceProperties()->multiProcessorCount;
//X dim of grid for cat array cooperates on a single tensor in the cat.
//Given half of the GPU, full utilization will always occur.
grid = dim3( 2LL * numSM, (long long) nTensors );
return true;
}
// Similar to any other IndexToOffset calculation for copying along a given
// dimension.
template <typename IndexType, int Dims>
struct CatArrIndexToOffset {
static inline __device__ IndexType compute(
const IndexType outputSize[Dims],
const IndexType outputStride[Dims],
const IndexType dimSize,
const unsigned int concatDim,
IndexType linearIndex) {
IndexType offset = 0;
#pragma unroll
for (int i = Dims - 1; i >= 1; --i) {
IndexType curDimSize = i == concatDim ? dimSize : outputSize[i];
IndexType nextDimIndex = linearIndex / curDimSize;
IndexType curDimIndex = linearIndex - curDimSize * nextDimIndex;
IndexType curDimOffset = curDimIndex * outputStride[i];
offset += curDimOffset;
linearIndex = nextDimIndex;
}
return offset + linearIndex * outputStride[0];
}
};
template <typename T, typename IndexType>
struct CatArrInputTensor {
T* input;
IndexType offset;
IndexType dimSize;
IndexType nElements;
};
template<typename IndexType, unsigned int MaxDims>
struct OutputTensorSizeStride {
IndexType outputSize[MaxDims];
IndexType outputStride[MaxDims];
};
/**
* Kernel used to concatenated grimDim.y tensors into an output tensor. Uses a
* grid-stride loop based off of the blockIdx.x, threadIdx.x for each input to
* copy each element from each input tensor into the output.
*
* output: base pointer to the storage associated with the output tensor
* inputs: GPU-allocated array of input metadata for each input to concatenate
* in the kernel
* os: the size/stride vectors for the output tensor
* concatDim: dimension along which we are concatenating
* dimStride: the stride of the output tensor at the concatDim
*
* The most important assumption made is that the input tensors are contiguous.
*/
template <typename T, typename IndexType, int Dims>
#ifdef __HIP_PLATFORM_HCC__
C10_LAUNCH_BOUNDS_1(512)
#endif
__global__ void CatArrayBatchedCopy(
T* output,
CatArrInputTensor<T, IndexType>* inputs,
OutputTensorSizeStride<IndexType, CAT_ARRAY_MAX_INPUT_DIMS> os,
const int concatDim,
IndexType dimStride) {
IndexType tid = blockIdx.x * blockDim.x + threadIdx.x;
IndexType nElements = inputs[blockIdx.y].nElements;
if(tid >= nElements) return;
T* data = inputs[blockIdx.y].input;
IndexType offset = inputs[blockIdx.y].offset;
IndexType dimSize = inputs[blockIdx.y].dimSize;
IndexType dataOffset = offset * dimStride;
IndexType stride = gridDim.x * blockDim.x;
while( tid < nElements){
IndexType elementOffset = CatArrIndexToOffset<IndexType, Dims>::compute(
os.outputSize, os.outputStride, dimSize, concatDim, tid);
output[dataOffset + elementOffset] = data[tid];
tid += stride;
}
}
void check_shape_except_dim(const Tensor &first, const Tensor &second,
int dimension)
{
int first_dims = first.dim();
int second_dims = second.dim();
TORCH_CHECK(first_dims == second_dims,
"Tensors must have same number of dimensions: got ", first_dims,
" and ", second_dims);
for (int dim = 0; dim < first_dims; dim++) {
if (dim == dimension) {
continue;
}
int64_t first_dim_size = at::native::size(first, dim);
int64_t second_dim_size = at::native::size(second, dim);
TORCH_CHECK(first_dim_size == second_dim_size,
"Sizes of tensors must match except in dimension ", dim, ". Got ",
static_cast<long long>(first_dim_size), " and ",
static_cast<long long>(second_dim_size));
}
}
template <typename scalar_t>
void parallel_cat(Tensor &out, const TensorList &inputs, int64_t dimension,
int nDims) {
// First, let's set up our kernel parameters. We start with a raw pointer to
// the storage for the output Tensor.
scalar_t *data = out.data_ptr<scalar_t>();
// Kernel Parameter
long tensorMetadataSize =
sizeof(CatArrInputTensor<scalar_t, unsigned int>) * CAT_ARRAY_BATCH_SIZE;
auto d_inputs_storage = at::empty(
{tensorMetadataSize}, out.options().dtype(at::kByte));
auto d_inputs = static_cast<CatArrInputTensor<scalar_t, unsigned int> *>(
d_inputs_storage.data_ptr());
OutputTensorSizeStride<unsigned int, CAT_ARRAY_MAX_INPUT_DIMS> param;
// Next, let's initialize the size, stride arrays for the output Tensor.
for (int i = 0; i < nDims; ++i) {
param.outputSize[i] = at::native::size(out, i);
param.outputStride[i] = out.stride(i);
}
at::cuda::CUDAStream stream = at::cuda::getCurrentCUDAStream();
// Now we loop
int batchCounter = 0;
int64_t offset = 0;
for (int i = 0; i < inputs.size() ; i += CAT_ARRAY_BATCH_SIZE) {
// Re-allocate stackInputs every iteration to avoid read-after-write hazard
{
auto stackInputs_storage = at::empty({tensorMetadataSize},
out.options().dtype(at::kByte).device(at::kCPU).pinned_memory(true));
auto stackInputs =
static_cast<CatArrInputTensor<scalar_t, unsigned int> *>(
stackInputs_storage.data_ptr());
for (batchCounter = 0;
batchCounter < CAT_ARRAY_BATCH_SIZE &&
(i+batchCounter) < inputs.size();
++batchCounter) {
int64_t dimSize = at::native::size(inputs[i+batchCounter], dimension);
stackInputs[batchCounter].input =
inputs[i+batchCounter].data_ptr<scalar_t>();
stackInputs[batchCounter].offset = offset;
stackInputs[batchCounter].dimSize = dimSize;
stackInputs[batchCounter].nElements = inputs[i+batchCounter].numel();
// update offset
offset += dimSize;
}
at::native::copy_(d_inputs_storage, stackInputs_storage,
/* non_blocking= */ true);
}
// Next, let's consider how we set our kernel launch parameters.
// We borrow from THCApply, which the kernel's internal indexing
// is based on.
dim3 applyBlock = dim3(32*16);
//Get grid where x dim fills half gpu and y dim is number of tensors.
//This will have cating two tensors fill the entire grid, but prevent
//many threads from needlessly load meta data if their sizes is small.
dim3 catGrid;
getCatGrid(batchCounter, catGrid);
// Template Declarations for dim = 1, 2, 3, 4
#define HANDLE_CASE(DIMS) \
CatArrayBatchedCopy<scalar_t, unsigned int, DIMS><<<\
catGrid, applyBlock, 0, stream.stream()>>>(\
data, d_inputs, param, dimension, param.outputStride[dimension]);
switch (nDims) {
case 1:
HANDLE_CASE(1);
break;
case 2:
HANDLE_CASE(2);
break;
case 3:
HANDLE_CASE(3);
break;
case 4:
HANDLE_CASE(4);
break;
}
#undef HANDLE_CASE
THCudaCheck(cudaGetLastError());
}
}
} // namespace
Tensor cat_cuda(TensorList inputs, int64_t dimension) {
Tensor out = at::empty({0}, inputs.front().options());
cat_out_cuda(out, inputs, dimension);
return out;
}
Tensor& cat_out_cuda(Tensor& out, TensorList inputs, int64_t dimension) {
// previously, size [0] tensors were the only possible empty tensors; thus, it
// wasn't possible to cat empty tensors unless all the other tensors were
// 1-dimensional, so we allowed these tensors to be "skipped". We maintain
// this behavior for backwards compatibility, but only for this specific size
// (i.e. other empty sizes are not skipped).
// FIXME: warn if this is the case
auto should_skip = [](const Tensor &t) {
return t.dim() == 1 && at::native::size(t, 0) == 0;
};
bool hasSkippedInput = false;
const Tensor *notSkippedTensor = NULL; // non-owning reference
int nDims = 0;
// Inputs cannot alias the output tensor
for (int i = 0; i < inputs.size(); i++) {
auto lap = at::get_overlap_status(out, inputs[i]);
TORCH_CHECK(lap != at::MemOverlapStatus::PARTIAL &&
lap != at::MemOverlapStatus::FULL,
"unsupported operation: the input tensors cannot refer to any "
"of the output memory locations. Found overlap in input "
"tensor ", i);
}
for (int i = 0; i < inputs.size(); i++)
{
if (should_skip(inputs[i])) {
hasSkippedInput = true;
continue;
}
nDims = inputs[i].dim();
notSkippedTensor = &inputs[i];
}
// If all inputs are empty tensors, return an empty tensor
if (notSkippedTensor == NULL) {
return out;
}
TORCH_CHECK(inputs.size() > 0, "invalid number of inputs ", inputs.size());
TORCH_CHECK(dimension >= 0, "invalid dimension ", dimension);
std::vector<int64_t> size(notSkippedTensor->sizes().vec());
// Compute size of the result in the cat dimension
int64_t cat_dim_size = 0;
for (int i = 0; i < inputs.size(); i++) {
const Tensor &tensor = inputs[i];
if (should_skip(tensor)) {
continue;
}
check_shape_except_dim(*notSkippedTensor, tensor, dimension);
cat_dim_size += at::native::size(tensor, dimension);
}
// Compute the size of the result
size[dimension] = cat_dim_size;
out.resize_(size);
if (out.numel() == 0) {
return out;
}
// We parallelize the copy if all 6 conditions pass:
//
// 1. There is more than one input tensor
// 2. No empty inputs
// 3. The out tensor is 32-bit indexable
// 4. The number of dimensions is <= 4
// 5. All input tensors are contiguous (output tensor may be non-contig)
// 6. All input tensors can use 32-bit indexing
// 7. All input tensors are on the same device
const bool all32BitIndexable = std::all_of(inputs.begin(), inputs.end(),
[] (const Tensor& t) {
return at::cuda::detail::canUse32BitIndexMath(t);
});
Device firstDevice = notSkippedTensor->device();
const bool allSameDevice = std::all_of(inputs.begin(), inputs.end(),
[firstDevice](const Tensor& t) {
return t.device() == firstDevice;
});
const bool allContiguous = std::all_of(inputs.begin(), inputs.end(),
[](const Tensor& t) {
return !t.defined() || t.is_contiguous();
});
if (inputs.size() > 1 &&
!hasSkippedInput &&
out.dim() <= CAT_ARRAY_MAX_INPUT_DIMS &&
at::cuda::detail::canUse32BitIndexMath(out) &&
allContiguous &&
all32BitIndexable &&
allSameDevice) {
AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(
at::ScalarType::Half, at::ScalarType::Bool, at::ScalarType::BFloat16,
out.scalar_type(), "cat_cuda", [&]() {
parallel_cat<scalar_t>(out, inputs, dimension, nDims);
});
} else {
int64_t offset = 0;
for (int j = 0; j < inputs.size(); j++)
{
if (should_skip(inputs[j])) continue;
int64_t dimSize = at::native::size(inputs[j], dimension);
Tensor nt = at::narrow(out, dimension, offset, dimSize);
copy_(nt, inputs[j]);
offset += dimSize;
}
}
return out;
}
} // namespace native
} // namespace at
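
As the comment in `cat_out_cuda` notes, only 1-D, size-0 tensors are skipped, purely for backwards compatibility; other empty shapes still have to match outside the cat dimension. A short usage illustration of the preserved behavior (not part of the PR):

```
import torch

x = torch.randn(2, 3, device="cuda")
legacy_empty = torch.empty(0, device="cuda")  # 1-D, size 0: silently skipped
print(torch.cat([x, legacy_empty, x], dim=1).shape)  # torch.Size([2, 6])
```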

View File

@@ -5290,13 +5290,13 @@
 - func: _cat(Tensor[] tensors, int dim=0) -> Tensor
   dispatch:
     CPU: _cat_cpu
-    CUDA: legacy::cuda::_th_cat
+    CUDA: cat_cuda
     QuantizedCPU: quantized_cat

 - func: _cat.out(Tensor[] tensors, int dim=0, *, Tensor(a!) out) -> Tensor(a!)
   dispatch:
     CPU: _cat_out_cpu
-    CUDA: legacy::cuda::_th_cat_out
+    CUDA: cat_out_cuda
     QuantizedCPU: quantized_cat_out

 - func: _mode(Tensor self, int dim=-1, bool keepdim=False) -> (Tensor, Tensor)
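
With these dispatch entries, `torch.cat` on CUDA tensors should now reach `cat_cuda`/`cat_out_cuda` through the usual `_cat` path. A minimal cross-device consistency check (illustrative only, not part of the PR):

```
import torch

a = torch.randn(3, 4, device="cuda")
b = torch.randn(5, 4, device="cuda")
out = torch.cat([a, b], dim=0)              # CUDA path
ref = torch.cat([a.cpu(), b.cpu()], dim=0)  # CPU path
assert torch.equal(out.cpu(), ref)
```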

View File

@@ -31,111 +31,4 @@ __global__ void THCTensor_copyToDiagonal(T* a, T* b, ptrdiff_t start, ptrdiff_t
}
}
#define CAT_ARRAY_BATCH_SIZE 1024
#define CAT_ARRAY_MAX_INPUT_DIMS 4
inline bool getCatGrid(THCState* state, ptrdiff_t nTensors, dim3& grid) {
int curDevice = -1;
cudaGetDevice(&curDevice);
if (curDevice == -1) {
return false;
}
// Assume a reasonable number of SMs if no state is available
int numSM =
state ? at::cuda::getCurrentDeviceProperties()->multiProcessorCount : 15;
//X dim of grid for cat array cooperates on a single tensor in the cat.
//Given half of the GPU, full utilization will always occur.
grid = dim3( 2LL * numSM, (long long) nTensors );
return true;
}
// Similar to any other IndexToOffset calculation for copying along a given dimension.
template <typename IndexType, int Dims>
struct CatArrIndexToOffset {
static inline __device__ IndexType compute(
const IndexType outputSize[Dims],
const IndexType outputStride[Dims],
const IndexType dimSize,
const unsigned int concatDim,
IndexType linearIndex) {
IndexType offset = 0;
#pragma unroll
for (int i = Dims - 1; i >= 1; --i) {
IndexType curDimSize = i == concatDim ? dimSize : outputSize[i];
IndexType nextDimIndex = linearIndex / curDimSize;
IndexType curDimIndex = linearIndex - curDimSize * nextDimIndex;
IndexType curDimOffset = curDimIndex * outputStride[i];
offset += curDimOffset;
linearIndex = nextDimIndex;
}
return offset + linearIndex * outputStride[0];
}
};
template <typename T, typename IndexType>
struct CatArrInputTensor {
T* input;
IndexType offset;
IndexType dimSize;
IndexType nElements;
};
template<typename IndexType, unsigned int MaxDims>
struct OutputTensorSizeStride {
IndexType outputSize[MaxDims];
IndexType outputStride[MaxDims];
};
/**
* Kernel used to concatenated grimDim.y tensors into an output tensor. Uses a grid-stride loop based off of
* the blockIdx.x, threadIdx.x for each input to copy each element from each input tensor into the output.
*
* output: base pointer to the storage associated with the output tensor
* inputs: GPU-allocated array of input metadata for each input to concatenate in the kernel
* os: the size/stride vectors for the output tensor
* concatDim: dimension along which we are concatenating
* dimStride: the stride of the output tensor at the concatDim
*
* The most important assumption made is that the input tensors are contiguous.
*/
template <typename T, typename IndexType, int Dims>
#ifdef __HIP_PLATFORM_HCC__
C10_LAUNCH_BOUNDS_1(512)
#endif
__global__ void CatArrayBatchedCopy(
T* output,
CatArrInputTensor<T, IndexType>* inputs,
OutputTensorSizeStride<IndexType, CAT_ARRAY_MAX_INPUT_DIMS> os,
const int concatDim,
IndexType dimStride) {
IndexType tid = blockIdx.x * blockDim.x + threadIdx.x;
IndexType nElements = inputs[blockIdx.y].nElements;
if(tid >= nElements) return;
T* data = inputs[blockIdx.y].input;
IndexType offset = inputs[blockIdx.y].offset;
IndexType dimSize = inputs[blockIdx.y].dimSize;
IndexType dataOffset = offset * dimStride;
IndexType stride = gridDim.x * blockDim.x;
while( tid < nElements){
IndexType elementOffset = CatArrIndexToOffset<IndexType, Dims>::compute(
os.outputSize, os.outputStride, dimSize, concatDim, tid);
output[dataOffset + elementOffset] = data[tid];
tid += stride;
}
}
#endif

View File

@@ -42,15 +42,6 @@ THCTensor_(numel)(THCState *state, THCTensor *t)
  return THCTensor_(nElement)(state, t);
}
void THCTensor_(cat)(THCState *state, THCTensor *result,
THCTensor *ta, THCTensor *tb, int dimension)
{
THCTensor* inputs[2];
inputs[0] = ta;
inputs[1] = tb;
THCTensor_(catArray)(state, result, inputs, 2, dimension);
}
void THCTensor_(check_shape_except_dim)(THCState *state,
THCTensor *first, THCTensor *second, int dimension);
inline void THCTensor_(check_shape_except_dim)(THCState *state,
@@ -73,185 +64,6 @@ inline void THCTensor_(check_shape_except_dim)(THCState *state,
}
}
void THCTensor_(catArray)(THCState *state, THCTensor *result,
THCTensor **inputs, int numInputs, int dimension)
{
// previously, size [0] tensors were the only possible empty tensors; thus, it wasn't possible
// to cat empty tensors unless all the other tensors were 1-dimensional, so we allowed these tensors
// to be "skipped". We maintain this behavior for backwards compatibility, but only for this specific
// size (i.e. other empty sizes are not skipped).
// FIXME: warn if this is the case
int i, j, cohortMax;
int64_t offset;
bool hasSkippedInput = false;
THCTensor *notSkippedTensor = NULL; // non-owning reference
auto should_skip = [](THCTensor *t) { return t->is_empty() && t->dim() == 1; };
int nDims = 0;
// Inputs cannot alias the output tensor
for (int i = 0; i < numInputs; i++) {
auto lap = at::get_overlap_status(result, inputs[i]);
THArgCheck(lap != at::MemOverlapStatus::PARTIAL &&
lap != at::MemOverlapStatus::FULL, 0,
"unsupported operation: the input tensors cannot refer to any of the "
"output memory locations. Found overlap in input tensor %d.", i);
}
for (i = 0; i < numInputs; i++)
{
if (should_skip(inputs[i])) {
hasSkippedInput = true;
continue;
}
nDims = inputs[i]->dim();
notSkippedTensor = inputs[i];
}
// If all inputs are empty tensors, return an empty tensor
if (notSkippedTensor == NULL) {
return;
}
THArgCheck(numInputs > 0, 3, "invalid number of inputs %d", numInputs);
THArgCheck(dimension >= 0, 4, "invalid dimension %d", dimension);
std::vector<int64_t> size(nDims);
// Compute size of the result in the cat dimension
int64_t cat_dim_size = 0;
for (int i = 0; i < numInputs; i++) {
THCTensor *tensor = inputs[i];
if (should_skip(tensor)) {
continue;
}
THCTensor_(check_shape_except_dim)(state, notSkippedTensor, tensor, dimension);
cat_dim_size += THCTensor_(size)(state, tensor, dimension);
}
// Compute the size of the result
for (int dim = 0; dim < nDims; dim++) {
int64_t result_dim_size = THCTensor_(size)(state, notSkippedTensor, dim);
if (dim == dimension) {
result_dim_size = cat_dim_size;
}
size[dim] = result_dim_size;
}
THCTensor_(resize)(state, result, size, {});
// We parallelize the copy if all 6 conditions pass:
//
// 1. There is more than one input tensor
// 2. No empty inputs
// 3. The result tensor is 32-bit indexable
// 4. The number of dimensions is <= 4
// 5. All input tensors are contiguous (output tensor may be non-contig)
// 6. All input tensors can use 32-bit indexing
// 7. All input tensors are on the same device
if (numInputs > 1 &&
!hasSkippedInput &&
result->dim() <= CAT_ARRAY_MAX_INPUT_DIMS &&
THCTensor_canUse32BitIndexMath(state, result) &&
THCTensor_allContiguous(state, inputs, numInputs) &&
THCTensor_all32BitIndexable(state, inputs, numInputs) &&
THCTensor_allSameDevice(state, inputs, numInputs)) {
// First, let's set up our kernel parameters. We start with a raw pointer to the storage
// for the output Tensor.
scalar_t *data = THCTensor_(data)(state, result);
// Kernel Parameter
size_t tensorMetadataSize = sizeof(CatArrInputTensor<scalar_t, unsigned int>) * CAT_ARRAY_BATCH_SIZE;
auto d_inputs = static_cast<CatArrInputTensor<scalar_t, unsigned int> *>(THCudaMalloc(state, tensorMetadataSize));
OutputTensorSizeStride<unsigned int, CAT_ARRAY_MAX_INPUT_DIMS> param;
// Next, let's initialize the size, stride arrays for the output Tensor.
for (i = 0; i < nDims; ++i) {
param.outputSize[i] = THCTensor_(size)(state, result, i);
param.outputStride[i] = THCTensor_(stride)(state, result, i);
}
at::cuda::CUDAStream stream = at::cuda::getCurrentCUDAStream();
// Template Declarations for dim = 1, 2, 3, 4
#define HANDLE_CASE(DIMS) \
CatArrayBatchedCopy<scalar_t, unsigned int, DIMS><<<catGrid, applyBlock, 0, stream.stream()>>>(data, d_inputs, param, dimension, param.outputStride[dimension]);
// Now we loop
offset = 0;
for (i = 0; i < numInputs; i += CAT_ARRAY_BATCH_SIZE) {
// Re-allocate stackInputs every iteration to avoid read-after-write hazard
{
auto stackInputs_owner = THCudaHostAlloc(state, tensorMetadataSize);
CatArrInputTensor<scalar_t, unsigned int>* stackInputs = static_cast<CatArrInputTensor<scalar_t, unsigned int>*>(stackInputs_owner.get());
cohortMax = 0;
for (j = 0; j < CAT_ARRAY_BATCH_SIZE && (i+j) < numInputs; ++j) {
int64_t dimSize = THCTensor_(size)(state, inputs[i+j], dimension);
stackInputs[j].input = THCTensor_(data)(state, inputs[i+j]);
stackInputs[j].offset = offset;
stackInputs[j].dimSize = dimSize;
stackInputs[j].nElements = THCTensor_(nElement)(state, inputs[i+j]);
cohortMax = cohortMax > (int) stackInputs[j].nElements ? cohortMax : (int) stackInputs[j].nElements;
// update offset
offset += dimSize;
}
THCudaCheck(cudaMemcpyAsync(
d_inputs,
stackInputs,
j * sizeof(CatArrInputTensor<scalar_t, unsigned int>),
cudaMemcpyHostToDevice,
stream.stream()));
THCudaHostRecord(state, stackInputs);
}
// Next, let's consider how we set our kernel launch parameters.
// We borrow from THCApply, which the kernel's internal indexing
// is based on.
dim3 applyBlock = getApplyBlock();
//Get grid where x dim fills half gpu and y dim is number of tensors.
//This will have cating two tensors fill the entire grid, but prevent
//many threads from needlessly load meta data if their sizes is small.
dim3 catGrid;
getCatGrid(state, j, catGrid);
switch (nDims) {
case 1:
HANDLE_CASE(1);
break;
case 2:
HANDLE_CASE(2);
break;
case 3:
HANDLE_CASE(3);
break;
case 4:
HANDLE_CASE(4);
break;
}
THCudaCheck(cudaGetLastError());
}
THCudaFree(state, d_inputs);
#undef HANDLE_CASE
} else {
offset = 0;
for (j = 0; j < numInputs; j++)
{
if (should_skip(inputs[j])) continue;
int64_t dimSize = THCTensor_(size)(state, inputs[j], dimension);
THCTensor *nt = THCTensor_(newWithTensor)(state, result);
THCTensor_(narrow)(state, nt, NULL, dimension, offset, dimSize);
THCTensor_(copy)(state, nt, inputs[j]);
THCTensor_(free)(state, nt);
offset += dimSize;
}
}
}
void THCTensor_(nonzero)(THCState* state, THCudaLongTensor *tensor,
THCTensor *self)
{

View File

@@ -4,8 +4,6 @@
 THC_API void THCTensor_(fill)(THCState *state, THCTensor *self, scalar_t value);
 THC_API void THCTensor_(zero)(THCState *state, THCTensor *self);
-THC_API void THCTensor_(cat)(THCState *state, THCTensor *result, THCTensor *ta, THCTensor *tb, int dimension);
-THC_API void THCTensor_(catArray)(THCState *state, THCTensor *result, THCTensor **inputs, int numInputs, int dimension);
 THC_API void THCTensor_(nonzero)(THCState* state, THCudaLongTensor *tensor, THCTensor *self);
 THC_API ptrdiff_t THCTensor_(numel)(THCState *state, THCTensor *t);

View File

@@ -5,6 +5,7 @@ from __future__ import unicode_literals

 import operator_benchmark as op_bench
 import torch
+import random


 """Microbenchmarks for Cat operator"""
@@ -12,11 +13,11 @@ import torch

 # Configs for PT Cat operator
 cat_configs_short = op_bench.config_list(
-    attr_names=['M', 'N', 'K', 'dim'],
+    attr_names=['sizes', 'N', 'dim'],
     attrs=[
-        [1, 1, 1, 0],
-        [256, 512, 1, 0],
-        [512, 512, 2, 1],
+        [(1, 1, 1), 2, 0], # noqa
+        [(512, 512, 2), 2, 1], # noqa
+        [(128, 1024, 2), 2, 1], # noqa
     ],
     cross_product_configs={
         'device': ['cpu', 'cuda'],
@@ -24,29 +25,81 @@ cat_configs_short = op_bench.config_list(
     tags=['short'],
 )

-cat_configs_long = op_bench.cross_product_configs(
-    M=[128],
-    N=[128, 1024],
-    K=[1, 2],
-    dim=[0, 1, 2],
-    device=['cpu', 'cuda'],
-    tags=['long']
+cat_configs_long = op_bench.config_list(
+    attr_names=['sizes', 'N', 'dim'],
+    attrs=[
+        [(2**10, 2**10, 2), 2, 0], # noqa
+        [(2**10+1, 2**10-1, 2), 2, 1], # noqa
+        [(2**10, 2**10, 2), 2, 2], # noqa
+
+        [[ lambda: random.randint(2**6, 2**7), 2**7-17, 2**6+1], # noqa
+         5, 0],
+        [[ 2**6+2**5, lambda: random.randint(2**6, 2**7), 2**6], # noqa
+         5, 1],
+        [[ 2**7, 2**6, lambda: random.randint(2**6, 2**7)], # noqa
+         5, 2],
+
+        [[lambda: random.randint(2**5, 2**6), 2**5, 2**6], # noqa
+         50, 0],
+        [[2**5, lambda: random.randint(2**5, 2**6), 2**6], # noqa
+         50, 1],
+        [[2**5+1, 2**6+1, lambda: random.randint(2**5, 2**6)], # noqa
+         50, 2],
+    ],
+    cross_product_configs={
+        'device': ['cpu', 'cuda'],
+    },
+    tags=['long'],
 )

+# There is a different codepath on CUDA for >4 dimensions
+cat_configs_multidim = op_bench.config_list(
+    attr_names=['sizes', 'N', 'dim'],
+    attrs=[
+        [(2**6, 2**5, 2**2, 2**4, 2**5), 2, 2], # noqa
+        [(2**4, 2**5, 2**2, 2**4, 2**5), 8, 2], # noqa
+        [(2**3+1, 2**5-1, 2**2+1, 2**4-1, 2**5+1), 17, 4], # noqa
+    ],
+    cross_product_configs={
+        'device': ['cpu', 'cuda'],
+    },
+    tags=['multidim'],
+)
+
+cat_configs_manyinputs = op_bench.config_list(
+    attr_names=['sizes', 'N', 'dim'],
+    attrs=[
+        [[lambda: random.randint(1, 10000)], 100, 0],
+        [[lambda: random.randint(1, 1000)], 1000, 0],
+        [[lambda: random.randint(1, 500)], 2000, 0],
+        [[lambda: random.randint(1, 300)], 3000, 0],
+    ],
+    cross_product_configs={
+        'device': ['cpu', 'cuda'],
+    },
+    tags=['manyinputs'],
+)
+

 class CatBenchmark(op_bench.TorchBenchmarkBase):
-    def init(self, M, N, K, dim, device):
-        self.input_one = torch.rand(M, N, K, device=device)
+    def init(self, sizes, N, dim, device):
+        random.seed(42)
+        self.inputs = []
+        for i in range(N):
+            current_sizes = [old_size() if callable(old_size) else old_size
+                             for old_size in sizes]
+            self.inputs.append(torch.rand(current_sizes, device=device))
         self.dim = dim
         self.set_module_name('cat')

     def forward(self):
-        return torch.cat((self.input_one, self.input_one), dim=self.dim)
+        return torch.cat(self.inputs, dim=self.dim)


-op_bench.generate_pt_test(cat_configs_short + cat_configs_long,
+op_bench.generate_pt_test(cat_configs_short +
+                          cat_configs_long +
+                          cat_configs_multidim +
+                          cat_configs_manyinputs,
                           CatBenchmark)


 if __name__ == "__main__":
     op_bench.benchmark_runner.main()
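
The new configs use callables so that some dimensions are randomized per input. As an illustration of how `CatBenchmark.init` materializes such a size spec (a standalone sketch mirroring one of the 'long' configs above, not additional PR code):

```
import random
import torch

random.seed(42)
# One entry is a callable, so every input gets a fresh size for that dimension.
size_spec = [lambda: random.randint(2**6, 2**7), 2**7 - 17, 2**6 + 1]  # i.e. [randint, 111, 65]
inputs = []
for _ in range(5):  # N = 5 inputs
    concrete = [s() if callable(s) else s for s in size_spec]
    inputs.append(torch.rand(concrete))
out = torch.cat(inputs, dim=0)  # dim 0 is the randomized dimension here
print(out.shape)
```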