// NOTE(review): the four angle-bracket header names were missing from the
// text under review (bare `#include` directives). The names below are
// reconstructed from what this file uses (kernel launch syntax and
// C10_CUDA_KERNEL_LAUNCH_CHECK) -- confirm them against the original file.
#include <cuda.h>
#include <cuda_runtime.h>
#include <c10/cuda/CUDAException.h>

#include <ATen/ATen.h>

#include "cuda_dlink_extension_add.cuh"

// Element-wise kernel: every thread whose flat global index i is below `size`
// calls add(a + i, b + i, output + i). `add` is declared in
// cuda_dlink_extension_add.cuh; presumably it stores *a + *b into *output --
// TODO confirm against that header.
//
// Expects a 1D grid of 1D blocks. Threads whose index falls at or past `size`
// (the tail of the last block) do nothing.
__global__ void add_kernel(const float* a, const float* b, float* output, int size) {
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < size) {
    add(a + i, b + i, output + i);
  }
}

// Host-side launcher for add_kernel over `size` elements.
//
// a, b    input arrays with at least `size` elements each (assumed to be
//         device-accessible pointers -- verify against the caller).
// output  destination array with at least `size` elements (same assumption).
// size    number of elements to process; values <= 0 are a no-op.
//
// NOTE(review): an earlier comment here read "output = a * b + c", which does
// not match this signature (there is no `c`). The actual per-element operation
// is whatever `add` in cuda_dlink_extension_add.cuh implements.
//
// The launch goes to the default stream and is asynchronous;
// C10_CUDA_KERNEL_LAUNCH_CHECK only reports launch-time errors.
void add_cuda(const float* a, const float* b, float* output, int size) {
  // A zero-block grid is an invalid launch configuration, so skip empty input
  // instead of surfacing a spurious launch error.
  if (size <= 0) {
    return;
  }

  const int threads = 1024;
  // Ceiling division written so that it cannot overflow `int` when `size` is
  // close to INT_MAX (unlike `(size + threads - 1) / threads`).
  const int blocks = size / threads + (size % threads != 0 ? 1 : 0);

  add_kernel<<<blocks, threads>>>(a, b, output, size);
  C10_CUDA_KERNEL_LAUNCH_CHECK();
}