Mirror of https://github.com/pytorch/pytorch.git
Synced 2025-10-20 21:14:14 +08:00

Drop unused variables; make things const; use some auto (#71107)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/71107
Test Plan: Sandcastle
Reviewed By: ngimel
Differential Revision: D33490773
fbshipit-source-id: 0d259db9c58c9b33aecc560075f6dcfa78883467

Commit cf61738097 (parent 3c2ae2b47c), committed by Facebook GitHub Bot
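The hunks below touch several CUDA operator files and apply the three changes named in the summary: locals that are computed once become const, variables and kernel parameters that are never read are dropped, and a few declarations switch to auto. As a reading aid, here is a minimal, hypothetical sketch of the const-local pattern; the kernel name, shapes, and launch are illustrative only and not code from this commit.

#include <cuda_runtime.h>
#include <cstdio>

__global__ void ScaleNCHW(const int nthreads, const float* in, float* out,
                          const int channels, const int height,
                          const int width) {
  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
       index += blockDim.x * gridDim.x) {
    // Index math is computed once per thread, so it can be const.
    const int w = index % width;
    const int h = (index / width) % height;
    const int n = index / width / height;
    const int offset = (n * channels * height + h) * width + w;
    out[offset] = 2.0f * in[offset];
  }
}

int main() {
  const int N = 2, C = 3, H = 4, W = 5;
  const int nthreads = N * H * W;  // one thread per (n, h, w)
  float *in = nullptr, *out = nullptr;
  cudaMallocManaged(&in, sizeof(float) * N * C * H * W);
  cudaMallocManaged(&out, sizeof(float) * N * C * H * W);
  for (int i = 0; i < N * C * H * W; ++i) in[i] = 1.0f;
  ScaleNCHW<<<(nthreads + 255) / 256, 256>>>(nthreads, in, out, C, H, W);
  cudaDeviceSynchronize();
  printf("out[0] = %f\n", out[0]);  // 2.000000
  cudaFree(in);
  cudaFree(out);
  return 0;
}

Marking these locals const costs nothing at runtime; it simply documents that the index arithmetic is computed once per thread and never reassigned.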
@@ -133,9 +133,9 @@ TORCH_API void record_kernel_function_dtype(std::string name);
     const auto& SCALAR_TYPE C10_UNUSED_DISPATCH_CUDA_WORKAROUND = enum_type; \
     const auto& UNDERLYING_TYPE C10_UNUSED_DISPATCH_CUDA_WORKAROUND = \
         toUnderlying(enum_type); \
-    int bit_width = bitwidth; \
-    int64_t quant_min = qmin; \
-    int64_t quant_max = qmax; \
+    C10_UNUSED int bit_width = bitwidth; \
+    C10_UNUSED int64_t quant_min = qmin; \
+    C10_UNUSED int64_t quant_max = qmax; \
     (void)bit_width; /* Suppress unused variable warning */ \
     (void)quant_min; /* Suppress unused variable warning */ \
     (void)quant_max; /* Suppress unused variable warning */ \
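The hunk above tags the helper variables declared by the dispatch macro with C10_UNUSED, so case bodies that never read bit_width / quant_min / quant_max still compile cleanly under -Wunused-variable; the pre-existing (void) casts stay as context. A self-contained sketch of the idea, assuming C10_UNUSED expands to a compiler "unused" attribute; MY_UNUSED and MY_QINT_CASE below are hypothetical stand-ins, not the real c10 macros.

#include <cstdint>
#include <cstdio>

#if defined(__GNUC__) || defined(__clang__)
#define MY_UNUSED __attribute__((__unused__))
#else
#define MY_UNUSED
#endif

// The case body may or may not mention the helpers; MY_UNUSED keeps
// -Wunused-variable quiet either way.
#define MY_QINT_CASE(bitwidth, qmin, qmax, ...) \
  {                                             \
    MY_UNUSED int bit_width = bitwidth;         \
    MY_UNUSED int64_t quant_min = qmin;         \
    MY_UNUSED int64_t quant_max = qmax;         \
    __VA_ARGS__                                 \
  }

int main() {
  // This expansion never reads the helper variables, yet compiles cleanly.
  MY_QINT_CASE(8, 0, 255, printf("qint8 case\n");)
  // This one does read them.
  MY_QINT_CASE(8, 0, 255, printf("range: %lld..%lld\n",
                                 (long long)quant_min, (long long)quant_max);)
  return 0;
}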
@@ -6,21 +6,21 @@ namespace caffe2 {
 namespace {
 template <typename T>
 __global__ void LRNFillScaleNCHW(const int nthreads, const T* in,
-    const int num, const int channels, const int height,
+    const int channels, const int height,
     const int width, const int size, const T alpha_over_size,
     const T bias, T* scale) {
   CUDA_1D_KERNEL_LOOP(index, nthreads) {
     // find out the local offset
-    int w = index % width;
-    int h = (index / width) % height;
-    int n = index / width / height;
-    int offset = (n * channels * height + h) * width + w;
-    int step = height * width;
+    const int w = index % width;
+    const int h = (index / width) % height;
+    const int n = index / width / height;
+    const int offset = (n * channels * height + h) * width + w;
+    const int step = height * width;
     in += offset;
     scale += offset;
     int head = 0;
-    int pre_pad = (size - 1) / 2;
-    int post_pad = size - pre_pad - 1;
+    const int pre_pad = (size - 1) / 2;
+    const int post_pad = size - pre_pad - 1;
     T accum_scale = 0;
     // fill the scale at [n, :, h, w]
     // accumulate values
@@ -54,16 +54,16 @@ __global__ void LRNFillScaleNCHW(const int nthreads, const T* in,
 }

 template <typename T>
-__global__ void LRNFillScaleNHWC(const int nthreads, const T* in,
-    const int num, const int height, const int width,
+__global__ void LRNFillScaleNHWC(const int nthreads, const T *const in,
+    const int height, const int width,
     const int channels, const int size, const T alpha_over_size,
     const T bias, T* scale) {
   CUDA_1D_KERNEL_LOOP(index, nthreads) {
-    int c = index % channels;
-    int pre_pad = (size - 1) / 2;
+    const int c = index % channels;
+    const int pre_pad = (size - 1) / 2;
     scale[index] = 0;
     for (int i = 0; i < size; ++i) {
-      int raw_idx = c + i - pre_pad;
+      const int raw_idx = c + i - pre_pad;
       if (raw_idx >= 0 && raw_idx < channels) {
         scale[index] += in[index + i - pre_pad] * in[index + i - pre_pad];
       }
@@ -85,17 +85,17 @@ __global__ void LRNComputeOutput(const int nthreads, const T* in,
 template <typename T>
 __global__ void LRNComputeDiffNCHW(const int nthreads, const T* bottom_data,
     const T* top_data, const T* scale, const T* top_diff,
-    const int num, const int channels, const int height,
+    const int channels, const int height,
     const int width, const int size, const T negative_beta,
     const T cache_ratio,
     T* bottom_diff) {
   CUDA_1D_KERNEL_LOOP(index, nthreads) {
     // find out the local offset
-    int w = index % width;
-    int h = (index / width) % height;
-    int n = index / width / height;
-    int offset = (n * channels * height + h) * width + w;
-    int step = height * width;
+    const int w = index % width;
+    const int h = (index / width) % height;
+    const int n = index / width / height;
+    const int offset = (n * channels * height + h) * width + w;
+    const int step = height * width;
     bottom_data += offset;
     top_data += offset;
     scale += offset;
@@ -155,13 +155,13 @@ __global__ void LRNComputeDiffNCHW(const int nthreads, const T* bottom_data,
 template <typename T>
 __global__ void LRNComputeDiffNHWC(const int nthreads, const T* bottom_data,
     const T* top_data, const T* scale, const T* top_diff,
-    const int num, const int height, const int width, const int channels,
+    const int height, const int width, const int channels,
     const int size, const T negative_beta, const T cache_ratio,
     T* bottom_diff) {
   CUDA_1D_KERNEL_LOOP(index, nthreads) {
     // find out the local channel offset
-    int c = index % channels;
-    int pre_pad = size / 2;
+    const int c = index % channels;
+    const int pre_pad = size / 2;
     T accum_ratio = 0;
     for (int i = -pre_pad; i < size - pre_pad; ++i) {
       if (c + i >= 0 && c + i < channels) {
@@ -200,7 +200,7 @@ bool LRNOp<float, CUDAContext>::RunOnDeviceWithOrderNCHW() {
   int n_threads = N * H * W;
   LRNFillScaleNCHW<float><<<CAFFE_GET_BLOCKS(n_threads), CAFFE_CUDA_NUM_THREADS,
                             0, context_.cuda_stream()>>>(
-      n_threads, Xdata, N, C, H, W, size_, alpha_ / size_, bias_, scale_data);
+      n_threads, Xdata, C, H, W, size_, alpha_ / size_, bias_, scale_data);
   C10_CUDA_KERNEL_LAUNCH_CHECK();

   n_threads = X.numel();
@@ -237,7 +237,7 @@ bool LRNOp<float, CUDAContext>::RunOnDeviceWithOrderNHWC() {
   int n_threads = X.numel();
   LRNFillScaleNHWC<float><<<CAFFE_GET_BLOCKS(n_threads), CAFFE_CUDA_NUM_THREADS,
                             0, context_.cuda_stream()>>>(
-      n_threads, Xdata, N, H, W, C, size_, alpha_ / size_, bias_, scale_data);
+      n_threads, Xdata, H, W, C, size_, alpha_ / size_, bias_, scale_data);
   C10_CUDA_KERNEL_LAUNCH_CHECK();

   LRNComputeOutput<float><<<CAFFE_GET_BLOCKS(n_threads), CAFFE_CUDA_NUM_THREADS,
@@ -271,20 +271,20 @@ bool LRNGradientOp<float, CUDAContext>::RunOnDeviceWithOrderNCHW() {
     scale_ = &local_scale_tensor_;
   }
   scale_->ResizeLike(X);
-  float* scale_data = scale_->template mutable_data<float>();
-  int n_threads = N * H * W;
+  float *const scale_data = scale_->template mutable_data<float>();
+  const int n_threads = N * H * W;
   LRNFillScaleNCHW<float><<<CAFFE_GET_BLOCKS(n_threads), CAFFE_CUDA_NUM_THREADS,
                             0, context_.cuda_stream()>>>(
-      n_threads, Xdata, N, C, H, W, size_, alpha_ / size_, bias_, scale_data);
+      n_threads, Xdata, C, H, W, size_, alpha_ / size_, bias_, scale_data);
   C10_CUDA_KERNEL_LAUNCH_CHECK();

-  const float* dYdata = dY.data<float>();
-  float* dXdata = dX->template mutable_data<float>();
+  const float *const dYdata = dY.data<float>();
+  float *const dXdata = dX->template mutable_data<float>();

   LRNComputeDiffNCHW<float><<<CAFFE_GET_BLOCKS(n_threads),
                               CAFFE_CUDA_NUM_THREADS,
                               0, context_.cuda_stream()>>>(
-      n_threads, Xdata, Ydata, scale_data, dYdata, N, C, H, W, size_, -beta_,
+      n_threads, Xdata, Ydata, scale_data, dYdata, C, H, W, size_, -beta_,
       2.f * alpha_ * beta_ / size_, dXdata);
   C10_CUDA_KERNEL_LAUNCH_CHECK();

@@ -317,7 +317,7 @@ bool LRNGradientOp<float, CUDAContext>::RunOnDeviceWithOrderNHWC() {
   int n_threads = X.numel();
   LRNFillScaleNHWC<float><<<CAFFE_GET_BLOCKS(n_threads), CAFFE_CUDA_NUM_THREADS,
                             0, context_.cuda_stream()>>>(
-      n_threads, Xdata, N, H, W, C, size_, alpha_ / size_, bias_, scale_data);
+      n_threads, Xdata, H, W, C, size_, alpha_ / size_, bias_, scale_data);
   C10_CUDA_KERNEL_LAUNCH_CHECK();

   LRNComputeDiffNHWC<float>
@@ -330,7 +330,6 @@ bool LRNGradientOp<float, CUDAContext>::RunOnDeviceWithOrderNHWC() {
       Y.data<float>(),
       scale_data,
       dY.data<float>(),
-      X.dim32(0),
       X.dim32(1),
       X.dim32(2),
       X.dim32(3),
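The LRN hunks above remove the batch-size parameter (num / N) that none of the kernels read, and every launch drops the matching argument in the same change, so the argument lists keep compiling. A hypothetical before/after sketch of that two-sided edit; the kernel, shapes, and launch are illustrative only.

#include <cuda_runtime.h>
#include <cstdio>

// After: `num` (the batch size) is gone because the flat index already
// encodes everything the kernel needs.
__global__ void FillScale(const int nthreads, const float* in, float* scale,
                          const int channels, const int height,
                          const int width) {
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < nthreads;
       i += blockDim.x * gridDim.x) {
    scale[i] = in[i] * in[i];  // toy body; never needs the batch size
  }
}

int main() {
  const int N = 2, C = 3, H = 4, W = 5;
  const int nthreads = N * C * H * W;
  float *in = nullptr, *scale = nullptr;
  cudaMallocManaged(&in, sizeof(float) * nthreads);
  cudaMallocManaged(&scale, sizeof(float) * nthreads);
  for (int i = 0; i < nthreads; ++i) in[i] = 0.5f;
  // Call site updated together with the signature: before the change the
  // launch would also have passed N, e.g. (nthreads, in, scale, N, C, H, W).
  FillScale<<<(nthreads + 255) / 256, 256>>>(nthreads, in, scale, C, H, W);
  cudaDeviceSynchronize();
  printf("scale[0] = %f\n", scale[0]);  // 0.250000
  cudaFree(in);
  cudaFree(scale);
  return 0;
}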
@@ -19,8 +19,7 @@ using c10::cuda::compat::pow;
 template <typename T>
 __global__ void LpPoolForwardNCHW(
     const int nthreads,
-    const T* bottom_data,
-    const int num,
+    const T *const bottom_data,
     const int channels,
     const int height,
     const int width,
@@ -32,7 +31,7 @@ __global__ void LpPoolForwardNCHW(
     const int stride_w,
     const int pad_t,
     const int pad_l,
-    T* top_data,
+    T *const top_data,
     const T p) {
   CUDA_1D_KERNEL_LOOP(index, nthreads) {
     int n = index;
@@ -64,7 +63,6 @@ template <typename T>
 __global__ void LpPoolForwardNHWC(
     const int nthreads,
     const T* bottom_data,
-    const int num,
     const int height,
     const int width,
     const int channels,
@@ -107,7 +105,6 @@ __global__ void LpPoolBackwardNCHW(
     const T* const top_diff,
     const T* const top_data,
     const T* const bottom_data,
-    const int num,
     const int channels,
     const int height,
     const int width,
@@ -143,8 +140,6 @@ __global__ void LpPoolBackwardNCHW(
         // figure out the pooling size
         int hstart = ph * stride_h - pad_t;
         int wstart = pw * stride_w - pad_l;
-        int hend = min(hstart + kernel_h, height);
-        int wend = min(wstart + kernel_w, width);
         hstart = max(hstart, 0);
         wstart = max(wstart, 0);
         gradient += top_diff_slice[ph * pooled_width + pw] *
@@ -162,7 +157,6 @@ __global__ void LpPoolBackwardNHWC(
     const T* const top_diff,
     const T* const top_data,
     const T* const bottom_data,
-    const int num,
     const int height,
     const int width,
     const int channels,
@@ -195,12 +189,8 @@ __global__ void LpPoolBackwardNHWC(
     for (int ph = phstart; ph < phend; ++ph) {
       for (int pw = pwstart; pw < pwend; ++pw) {
         // figure out the pooling size
-        int hstart = ph * stride_h - pad_t;
-        int wstart = pw * stride_w - pad_l;
-        int hend = min(hstart + kernel_h, height);
-        int wend = min(wstart + kernel_w, width);
-        hstart = max(hstart, 0);
-        wstart = max(wstart, 0);
+        const int hstart = max(ph * stride_h - pad_t, 0);
+        const int wstart = max(pw * stride_w - pad_l, 0);
         gradient += top_diff_slice[(ph * pooled_width + pw) * channels] *
             bottom_data[index] * pow(abs(bottom_data[index]), p - 2) /
             pow(top_data_slice[(ph * pooled_width + pw) * channels], p - 1);
@@ -225,7 +215,6 @@ bool PoolOp<float, CUDAContext, LpPoolFunctor>::RunOnDeviceWithOrderNCHW() {
       context_.cuda_stream()>>>(
       output_size,
       X.data<float>(),
-      X.dim32(0),
       X.dim32(1),
       X.dim32(2),
       X.dim32(3),
@@ -257,7 +246,6 @@ bool PoolOp<float, CUDAContext, LpPoolFunctor>::RunOnDeviceWithOrderNHWC() {
      context_.cuda_stream()>>>(
      output_size,
      X.data<float>(),
-      X.dim32(0),
      X.dim32(1),
      X.dim32(2),
      X.dim32(3),
@@ -295,7 +283,6 @@ bool PoolGradientOp<float, CUDAContext, LpPoolFunctor>::
      dY.data<float>(),
      Y.data<float>(),
      X.data<float>(),
-      X.dim32(0),
      X.dim32(1),
      X.dim32(2),
      X.dim32(3),
@@ -333,7 +320,6 @@ bool PoolGradientOp<float, CUDAContext, LpPoolFunctor>::
      dY.data<float>(),
      Y.data<float>(),
      X.data<float>(),
-      X.dim32(0),
      X.dim32(1),
      X.dim32(2),
      X.dim32(3),
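Several LpPool signatures above change T* to T *const and const T* to const T *const. The right-hand const fixes the pointer itself (it is never re-pointed inside the kernel), while the left-hand const protects the pointed-to data. A small sketch with hypothetical names:

#include <cuda_runtime.h>
#include <cstdio>

__global__ void Axpy(const int n,
                     const float a,
                     const float *const x,  // data read-only, pointer fixed
                     float *const y) {      // data writable, pointer fixed
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += blockDim.x * gridDim.x) {
    // x[i] = 0.f;  // would not compile: the pointee is const
    // y = x;       // would not compile: the pointer itself is const
    y[i] += a * x[i];
  }
}

int main() {
  const int n = 16;
  float *x = nullptr, *y = nullptr;
  cudaMallocManaged(&x, n * sizeof(float));
  cudaMallocManaged(&y, n * sizeof(float));
  for (int i = 0; i < n; ++i) { x[i] = 1.f; y[i] = 2.f; }
  Axpy<<<1, 32>>>(n, 3.f, x, y);
  cudaDeviceSynchronize();
  printf("y[0] = %f\n", y[0]);  // 5.000000
  cudaFree(x);
  cudaFree(y);
  return 0;
}

Since kernel parameters are copies, the top-level const changes nothing for callers; it only documents intent inside the kernel body.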
@@ -124,16 +124,15 @@ __global__ void PReluWGradientKernelNCHW(
 template <typename T>
 __global__ void PReluWGradientKernelNHWC(
     const int C,
-    const int N,
     const int num_items,
     const T* Xdata,
     const T* dYdata,
     T* dW) {
-  int c = blockIdx.x;
+  const auto c = blockIdx.x;
   T wsum = 0.0;
-  int items_per_channel = num_items / C;
+  const auto items_per_channel = num_items / C;
   for (int i = threadIdx.x; i < items_per_channel; i += blockDim.x) {
-    int ii = i * C + c;
+    const auto ii = i * C + c;
     wsum += (Xdata[ii] <= 0) * dYdata[ii] * Xdata[ii];
   }

@@ -270,7 +269,7 @@ bool PReluGradientOp<float, CUDAContext>::RunOnDevice() {
       C,
       CAFFE_CUDA_NUM_THREADS,
       0,
-      context_.cuda_stream()>>>(C, N, X.numel(), Xdata, dYdata, dWdata);
+      context_.cuda_stream()>>>(C, X.numel(), Xdata, dYdata, dWdata);
   C10_CUDA_KERNEL_LAUNCH_CHECK();

   PReluGradientKernelNHWC<<<
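In the PRelu hunk above, int c = blockIdx.x becomes const auto c = blockIdx.x. blockIdx.x and threadIdx.x are unsigned int, so auto keeps the type of the initializer instead of converting it to int, and const records that the index never changes. A hypothetical sketch of the same shape of kernel:

#include <cuda_runtime.h>
#include <cstdio>

__global__ void ChannelSum(const int C, const int items_per_channel,
                           const float* x, float* per_channel_sum) {
  const auto c = blockIdx.x;      // unsigned int: one block per channel
  float sum = 0.f;
  for (int i = threadIdx.x; i < items_per_channel; i += blockDim.x) {
    const auto ii = i * C + c;    // index of item i in channel c (NHWC-style)
    sum += x[ii];
  }
  atomicAdd(&per_channel_sum[c], sum);
}

int main() {
  const int C = 4, items_per_channel = 8;
  float *x = nullptr, *sums = nullptr;
  cudaMallocManaged(&x, C * items_per_channel * sizeof(float));
  cudaMallocManaged(&sums, C * sizeof(float));
  for (int i = 0; i < C * items_per_channel; ++i) x[i] = 1.f;
  for (int c = 0; c < C; ++c) sums[c] = 0.f;
  ChannelSum<<<C, 32>>>(C, items_per_channel, x, sums);
  cudaDeviceSynchronize();
  printf("sums[0] = %f\n", sums[0]);  // 8.000000
  cudaFree(x);
  cudaFree(sums);
  return 0;
}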
@@ -80,8 +80,6 @@ __global__ void ROIPoolBackward(
     const int nthreads,
     const T* top_diff,
     const int* argmax_data,
-    const int num_rois,
-    const T spatial_scale,
     const int channels,
     const int height,
     const int width,
@@ -91,10 +89,10 @@ __global__ void ROIPoolBackward(
     const T* bottom_rois) {
   CUDA_1D_KERNEL_LOOP(index, nthreads) {
     // (n, c, ph, pw) is an element in the pooled output
-    int pw = index % pooled_width;
-    int ph = (index / pooled_width) % pooled_height;
-    int c = (index / pooled_width / pooled_height) % channels;
-    int n = index / pooled_width / pooled_height / channels;
+    const int pw = index % pooled_width;
+    const int ph = (index / pooled_width) % pooled_height;
+    const int c = (index / pooled_width / pooled_height) % channels;
+    const int n = index / pooled_width / pooled_height / channels;

     const T* offset_bottom_rois = bottom_rois + n * 5;
     int roi_batch_ind = offset_bottom_rois[0];
@@ -184,8 +182,6 @@ C10_EXPORT bool RoIPoolGradientOp<float, CUDAContext>::RunOnDevice() {
         dY.numel(),
         dY.data<float>(),
         A.data<int>(),
-        R.dim32(0),
-        spatial_scale_,
         X.dim32(1),
         X.dim32(2),
         X.dim32(3),
@@ -49,8 +49,7 @@ __global__ void length_sum_kernel(
     T* __restrict__ out,
     const int* __restrict__ prefix_sum_length_data,
     int N,
-    int post,
-    int len_length) {
+    int post) {
   // len_length blocks
   int group = blockIdx.x;

@@ -177,12 +176,11 @@ C10_LAUNCH_BOUNDS_2(1024, SEGREDUCE_MINBLOCKS)
 #endif
 __global__ void length_weighted_sum_gradient_kernel(
     const T* __restrict__ grad_in,
-    const T* __restrict__ weights_in,
+    const T *const __restrict__ weights_in,
     T* __restrict__ grad_out,
-    const int* __restrict__ prefix_sum_length_data,
-    int N,
-    int post,
-    int len_length) {
+    const int *const __restrict__ prefix_sum_length_data,
+    const int N,
+    const int post) {
   // len_length blocks
   int group = blockIdx.x;

@@ -213,21 +211,20 @@ template <typename T, typename IndexType, int NumThreads>
 C10_LAUNCH_BOUNDS_2(1024, SEGREDUCE_MINBLOCKS)
 #endif
 __global__ void length_weighted_sum_with_main_input_gradient_kernel(
-    const T* __restrict__ grad_in,
-    const T* __restrict__ weights_in,
-    const T* __restrict__ data_in,
-    const IndexType* __restrict__ indices,
-    T* __restrict__ data_grad_out,
-    T* __restrict__ weights_grad_out,
-    const int* __restrict__ prefix_sum_length_data,
-    int N,
-    int post,
-    int len_length) {
+    const T *const __restrict__ grad_in,
+    const T *const __restrict__ weights_in,
+    const T *const __restrict__ data_in,
+    const IndexType *const __restrict__ indices,
+    T *const __restrict__ data_grad_out,
+    T *const __restrict__ weights_grad_out,
+    const int *const __restrict__ prefix_sum_length_data,
+    const int N,
+    const int post) {
   // len_length blocks
-  int group = blockIdx.x;
+  const int group = blockIdx.x;

-  int start = group == 0 ? 0 : prefix_sum_length_data[group - 1];
-  int end = prefix_sum_length_data[group];
+  const int start = group == 0 ? 0 : prefix_sum_length_data[group - 1];
+  const int end = prefix_sum_length_data[group];
   CUDA_KERNEL_ASSERT(start <= N);
   CUDA_KERNEL_ASSERT(end <= N);

@@ -257,19 +254,17 @@ C10_LAUNCH_BOUNDS_2(1024, SEGREDUCE_MINBLOCKS)
 #endif
 __global__ void sparse_length_max_kernel(
     const T* __restrict__ in,
-    T* __restrict__ out,
-    const int* __restrict__ prefix_sum_length_data,
-    const IndexType* __restrict__ indices,
-    int N,
-    int post,
-    int len_length,
-    int len_indices,
+    T *const __restrict__ out,
+    const int *const __restrict__ prefix_sum_length_data,
+    const IndexType *const __restrict__ indices,
+    const int post,
+    const int len_indices,
     const T numeric_min) {
   // len_length blocks
-  int group = blockIdx.x;
+  const int group = blockIdx.x;

-  int start = group == 0 ? 0 : prefix_sum_length_data[group - 1];
-  int end = prefix_sum_length_data[group];
+  const int start = group == 0 ? 0 : prefix_sum_length_data[group - 1];
+  const int end = prefix_sum_length_data[group];
   CUDA_KERNEL_ASSERT(start <= len_indices);
   CUDA_KERNEL_ASSERT(end <= len_indices);

@@ -487,12 +482,12 @@ class CUDASparseLengthsSumOp : public Operator<CUDAContext> {
       if (post <= maxThreads) {
         length_sum_kernel<T, true, false>
             <<<len_length, post, 0, context_.cuda_stream()>>>(
-                in_data, out_data, prefix_sum_length_data, N, post, len_length);
+                in_data, out_data, prefix_sum_length_data, N, post);
         C10_CUDA_KERNEL_LAUNCH_CHECK();
       } else {
         length_sum_kernel<T, true, false>
             <<<len_length, maxThreads, 0, context_.cuda_stream()>>>(
-                in_data, out_data, prefix_sum_length_data, N, post, len_length);
+                in_data, out_data, prefix_sum_length_data, N, post);
         C10_CUDA_KERNEL_LAUNCH_CHECK();
       }
     }
@@ -625,13 +620,13 @@ class CUDASparseLengthsMeanOp : public Operator<CUDAContext> {
         // calling cuda kernel with ExactBlock = true, Average = true
         length_sum_kernel<T, true, true>
             <<<len_length, post, 0, context_.cuda_stream()>>>(
-                in_data, out_data, prefix_sum_length_data, N, post, len_length);
+                in_data, out_data, prefix_sum_length_data, N, post);
         C10_CUDA_KERNEL_LAUNCH_CHECK();
       } else {
         // calling cuda kernel with ExactBlock = true, Average = true
         length_sum_kernel<T, true, true>
             <<<len_length, maxThreads, 0, context_.cuda_stream()>>>(
-                in_data, out_data, prefix_sum_length_data, N, post, len_length);
+                in_data, out_data, prefix_sum_length_data, N, post);
         C10_CUDA_KERNEL_LAUNCH_CHECK();
       }
     }
@@ -719,9 +714,9 @@ class CUDASparseLengthsMaxOp : public Operator<CUDAContext> {
     T numeric_min = std::numeric_limits<T>::min();
     if (SparseFused) {
       if (post <= maxThreads) {
-        int multiple = std::min(maxThreads / post, SEGREDUCE_MINBLOCKS);
-        dim3 block(post, multiple);
-        size_t smem = sizeof(T) * post * multiple;
+        const int multiple = std::min(maxThreads / post, SEGREDUCE_MINBLOCKS);
+        const dim3 block(post, multiple);
+        const size_t smem = sizeof(T) * post * multiple;

         sparse_length_max_kernel<T, IndexType, true>
             <<<len_length, block, smem, context_.cuda_stream()>>>(
@@ -729,9 +724,7 @@ class CUDASparseLengthsMaxOp : public Operator<CUDAContext> {
                 out_data,
                 prefix_sum_length_data,
                 indices,
-                N,
                 post,
-                len_length,
                 dataToReduceSize,
                 numeric_min);
         C10_CUDA_KERNEL_LAUNCH_CHECK();
@@ -742,9 +735,7 @@ class CUDASparseLengthsMaxOp : public Operator<CUDAContext> {
                 out_data,
                 prefix_sum_length_data,
                 indices,
-                N,
                 post,
-                len_length,
                 dataToReduceSize,
                 numeric_min);
         C10_CUDA_KERNEL_LAUNCH_CHECK();
@@ -1528,8 +1519,7 @@ class CUDASparseLengthsWeightedSumGradientWithIndicesOp
              out_data,
              prefix_sum_length_data,
              N,
-              post,
-              len_length);
+              post);
       C10_CUDA_KERNEL_LAUNCH_CHECK();
     } else {
       length_weighted_sum_gradient_kernel<T, false>
@@ -1539,8 +1529,7 @@ class CUDASparseLengthsWeightedSumGradientWithIndicesOp
              out_data,
              prefix_sum_length_data,
              N,
-              post,
-              len_length);
+              post);
       C10_CUDA_KERNEL_LAUNCH_CHECK();
     }

@@ -1768,8 +1757,7 @@ class CUDASparseLengthsIndicesInGradientWeightedSumWithMainInputGradientOp
              out_weight_grads,
              prefix_sum_length_data,
              N,
-              post,
-              len_length);
+              post);
       C10_CUDA_KERNEL_LAUNCH_CHECK();
     } else if (post > 64) {
       length_weighted_sum_with_main_input_gradient_kernel<T, IndexType, 128>
@@ -1782,8 +1770,7 @@ class CUDASparseLengthsIndicesInGradientWeightedSumWithMainInputGradientOp
              out_weight_grads,
              prefix_sum_length_data,
              N,
-              post,
-              len_length);
+              post);
       C10_CUDA_KERNEL_LAUNCH_CHECK();
     } else if (post > 32) {
       length_weighted_sum_with_main_input_gradient_kernel<T, IndexType, 64>
@@ -1796,8 +1783,7 @@ class CUDASparseLengthsIndicesInGradientWeightedSumWithMainInputGradientOp
              out_weight_grads,
              prefix_sum_length_data,
              N,
-              post,
-              len_length);
+              post);
       C10_CUDA_KERNEL_LAUNCH_CHECK();
     } else {
       length_weighted_sum_with_main_input_gradient_kernel<T, IndexType, 32>
@@ -1810,8 +1796,7 @@ class CUDASparseLengthsIndicesInGradientWeightedSumWithMainInputGradientOp
              out_weight_grads,
              prefix_sum_length_data,
              N,
-              post,
-              len_length);
+              post);
       C10_CUDA_KERNEL_LAUNCH_CHECK();
     }

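Across the sparse-lengths kernels above, the trailing len_length parameter is dropped: the kernels are launched with one block per segment, read their segment id from blockIdx.x, and take the segment bounds from prefix_sum_length_data, so the explicit count was never used inside the kernel body. A hypothetical sketch of that launch structure (names and host setup are illustrative, not the real operators):

#include <cuda_runtime.h>
#include <cstdio>

__global__ void segment_sum(const float *const in,
                            float *const out,
                            const int *const prefix_sum_length_data,
                            const int post) {   // note: no len_length parameter
  const int group = blockIdx.x;                 // gridDim.x == number of segments
  const int start = group == 0 ? 0 : prefix_sum_length_data[group - 1];
  const int end = prefix_sum_length_data[group];
  for (int j = threadIdx.x; j < post; j += blockDim.x) {
    float acc = 0.f;
    for (int line = start; line < end; ++line) {
      acc += in[line * post + j];
    }
    out[group * post + j] = acc;
  }
}

int main() {
  const int post = 4;
  float *in = nullptr, *out = nullptr;
  int *prefix = nullptr;
  cudaMallocManaged(&in, 5 * post * sizeof(float));
  cudaMallocManaged(&out, 2 * post * sizeof(float));
  cudaMallocManaged(&prefix, 2 * sizeof(int));
  for (int i = 0; i < 5 * post; ++i) in[i] = 1.f;
  prefix[0] = 2;  // segment lengths {2, 3} -> inclusive prefix sum {2, 5}
  prefix[1] = 5;
  // The segment count (2) appears only as the grid size of the launch.
  segment_sum<<<2, 32>>>(in, out, prefix, post);
  cudaDeviceSynchronize();
  printf("out[0] = %f, out[post] = %f\n", out[0], out[post]);  // 2.0, 3.0
  cudaFree(in);
  cudaFree(out);
  cudaFree(prefix);
  return 0;
}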
@@ -5,7 +5,7 @@
 namespace caffe2 {
 namespace {
 __global__ void SliceCopyKernel(
-    char* src_offset_bytes,
+    const char* src_offset_bytes,
     int src_block_size_bytes,
     char* dst_offset_bytes,
     int dst_block_size_bytes,
@@ -165,8 +165,8 @@ bool SliceImplGpu(

     size_t src_block_size_bytes = itemsize * src_block_size;
     size_t dst_block_size_bytes = itemsize * dst_block_size;
-    char* src_offset_bytes = src_bytes + itemsize * src_offset;
-    char* dst_offset_bytes = dst_bytes;
+    const char *const src_offset_bytes = src_bytes + itemsize * src_offset;
+    char *const dst_offset_bytes = dst_bytes;

     SliceCopyKernel<<<
         std::min(num_blocks, CAFFE_MAXIMUM_NUM_BLOCKS),
@@ -182,25 +182,22 @@ bool SliceImplGpu(
         num_blocks);
     C10_CUDA_KERNEL_LAUNCH_CHECK();
   } else {
-    char* src_bytes = (char*)go->raw_data();
-    char* dst_bytes = (char*)gdata->raw_mutable_data(go->meta());
+    const char *const src_bytes = (char*)go->raw_data();
+    char *const dst_bytes = (char*)gdata->raw_mutable_data(go->meta());

-    size_t src_nbytes = go->nbytes();
-    size_t dst_nbytes = gdata->nbytes();
-
-    size_t src_block_size = unit * (ends_idx[dim] - starts_idx[dim]);
-    size_t dst_block_size = unit * data.size(dim);
-    size_t dst_offset = unit * starts_idx[dim];
+    const size_t src_block_size = unit * (ends_idx[dim] - starts_idx[dim]);
+    const size_t dst_block_size = unit * data.size(dim);
+    const size_t dst_offset = unit * starts_idx[dim];

     if (num_blocks == 0 || dst_block_size == 0) {
       return true;
     }

-    size_t src_block_size_bytes = itemsize * src_block_size;
-    size_t dst_block_size_bytes = itemsize * dst_block_size;
+    const size_t src_block_size_bytes = itemsize * src_block_size;
+    const size_t dst_block_size_bytes = itemsize * dst_block_size;

-    char* src_offset_bytes = src_bytes;
-    char* dst_offset_bytes = dst_bytes + itemsize * dst_offset;
+    const char *const src_offset_bytes = src_bytes;
+    char *const dst_offset_bytes = dst_bytes + itemsize * dst_offset;
     // Zero out gradient blob before copy since we copy in fewer items than
     // there is space for
     math::Set<float, CUDAContext>(
@@ -231,7 +231,6 @@ __global__ void ComputeXGradientNCHWCUDAKernel(
 template <typename T>
 __global__ void ComputeXGradientNHWCCUDAKernel(
     const int C,
-    const int HxW,
     const T* dY,
     const T* X,
     const T* alpha,
@@ -462,7 +461,7 @@ void SpatialBNGradientOp<CUDAContext>::ComputeXGradient(
       <<<dim3(N * HxW, M),
          CAFFE_CUDA_NUM_THREADS,
          0,
-         context_.cuda_stream()>>>(C, HxW, dY, X, alpha, beta, gamma, dX);
+         context_.cuda_stream()>>>(C, dY, X, alpha, beta, gamma, dX);
   C10_CUDA_KERNEL_LAUNCH_CHECK();
 }
 }