Drop unused variables; make things const; use some auto (#71107)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/71107

Test Plan: Sandcastle

Reviewed By: ngimel

Differential Revision: D33490773

fbshipit-source-id: 0d259db9c58c9b33aecc560075f6dcfa78883467
Author: Richard Barnes (2022-01-11 08:54:22 -08:00)
Committed by: Facebook GitHub Bot
Parent: 3c2ae2b47c
Commit: cf61738097
8 changed files with 97 additions and 136 deletions
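
The hunks below repeat the same three mechanical changes across eight CUDA/C++ files: parameters and locals that are never read are deleted, values that are never reassigned become const, and auto is used where the type is obvious from the initializer. As a condensed, hypothetical sketch of all three (illustrative only, not code from this commit):

#include <cstdint>
#include <vector>

// Before: the function also took an unused `scale` parameter, `limit` was a
// mutable int, and the loop spelled out the element type.
int64_t sum_small(const std::vector<int64_t>& values) {  // unused parameter dropped
  const int64_t limit = 100;        // const: never reassigned
  int64_t total = 0;                // stays mutable: it accumulates
  for (const auto v : values) {     // auto: element type comes from the container
    if (v < limit) {
      total += v;
    }
  }
  return total;
}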

View File

@ -133,9 +133,9 @@ TORCH_API void record_kernel_function_dtype(std::string name);
const auto& SCALAR_TYPE C10_UNUSED_DISPATCH_CUDA_WORKAROUND = enum_type; \
const auto& UNDERLYING_TYPE C10_UNUSED_DISPATCH_CUDA_WORKAROUND = \
toUnderlying(enum_type); \
int bit_width = bitwidth; \
int64_t quant_min = qmin; \
int64_t quant_max = qmax; \
C10_UNUSED int bit_width = bitwidth; \
C10_UNUSED int64_t quant_min = qmin; \
C10_UNUSED int64_t quant_max = qmax; \
(void)bit_width; /* Suppress unused variable warning */ \
(void)quant_min; /* Suppress unused variable warning */ \
(void)quant_max; /* Suppress unused variable warning */ \
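
The macro body above both tags the new locals with C10_UNUSED and keeps the (void) casts. A minimal sketch of why both exist, using a stand-in MY_UNUSED macro (the real C10_UNUSED lives in c10/macros/Macros.h and its exact expansion is compiler-dependent, so treat this as illustrative):

#include <cstdint>

#if defined(__GNUC__) || defined(__clang__)
#define MY_UNUSED __attribute__((__unused__))
#else
#define MY_UNUSED
#endif

void quantization_bounds_example() {
  MY_UNUSED const int bit_width = 8;        // attribute silences -Wunused-variable where supported
  MY_UNUSED const int64_t quant_min = 0;
  MY_UNUSED const int64_t quant_max = 255;
  (void)bit_width;                          // the (void) casts stay as a fallback for
  (void)quant_min;                          // toolchains that do not honor the attribute
  (void)quant_max;
}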

View File

@ -6,21 +6,21 @@ namespace caffe2 {
namespace {
template <typename T>
__global__ void LRNFillScaleNCHW(const int nthreads, const T* in,
const int num, const int channels, const int height,
const int channels, const int height,
const int width, const int size, const T alpha_over_size,
const T bias, T* scale) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
// find out the local offset
int w = index % width;
int h = (index / width) % height;
int n = index / width / height;
int offset = (n * channels * height + h) * width + w;
int step = height * width;
const int w = index % width;
const int h = (index / width) % height;
const int n = index / width / height;
const int offset = (n * channels * height + h) * width + w;
const int step = height * width;
in += offset;
scale += offset;
int head = 0;
int pre_pad = (size - 1) / 2;
int post_pad = size - pre_pad - 1;
const int pre_pad = (size - 1) / 2;
const int post_pad = size - pre_pad - 1;
T accum_scale = 0;
// fill the scale at [n, :, h, w]
// accumulate values
@ -54,16 +54,16 @@ __global__ void LRNFillScaleNCHW(const int nthreads, const T* in,
}
template <typename T>
__global__ void LRNFillScaleNHWC(const int nthreads, const T* in,
const int num, const int height, const int width,
__global__ void LRNFillScaleNHWC(const int nthreads, const T *const in,
const int height, const int width,
const int channels, const int size, const T alpha_over_size,
const T bias, T* scale) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
int c = index % channels;
int pre_pad = (size - 1) / 2;
const int c = index % channels;
const int pre_pad = (size - 1) / 2;
scale[index] = 0;
for (int i = 0; i < size; ++i) {
int raw_idx = c + i - pre_pad;
const int raw_idx = c + i - pre_pad;
if (raw_idx >= 0 && raw_idx < channels) {
scale[index] += in[index + i - pre_pad] * in[index + i - pre_pad];
}
@ -85,17 +85,17 @@ __global__ void LRNComputeOutput(const int nthreads, const T* in,
template <typename T>
__global__ void LRNComputeDiffNCHW(const int nthreads, const T* bottom_data,
const T* top_data, const T* scale, const T* top_diff,
const int num, const int channels, const int height,
const int channels, const int height,
const int width, const int size, const T negative_beta,
const T cache_ratio,
T* bottom_diff) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
// find out the local offset
int w = index % width;
int h = (index / width) % height;
int n = index / width / height;
int offset = (n * channels * height + h) * width + w;
int step = height * width;
const int w = index % width;
const int h = (index / width) % height;
const int n = index / width / height;
const int offset = (n * channels * height + h) * width + w;
const int step = height * width;
bottom_data += offset;
top_data += offset;
scale += offset;
@ -155,13 +155,13 @@ __global__ void LRNComputeDiffNCHW(const int nthreads, const T* bottom_data,
template <typename T>
__global__ void LRNComputeDiffNHWC(const int nthreads, const T* bottom_data,
const T* top_data, const T* scale, const T* top_diff,
const int num, const int height, const int width, const int channels,
const int height, const int width, const int channels,
const int size, const T negative_beta, const T cache_ratio,
T* bottom_diff) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
// find out the local channel offset
int c = index % channels;
int pre_pad = size / 2;
const int c = index % channels;
const int pre_pad = size / 2;
T accum_ratio = 0;
for (int i = -pre_pad; i < size - pre_pad; ++i) {
if (c + i >= 0 && c + i < channels) {
@ -200,7 +200,7 @@ bool LRNOp<float, CUDAContext>::RunOnDeviceWithOrderNCHW() {
int n_threads = N * H * W;
LRNFillScaleNCHW<float><<<CAFFE_GET_BLOCKS(n_threads), CAFFE_CUDA_NUM_THREADS,
0, context_.cuda_stream()>>>(
n_threads, Xdata, N, C, H, W, size_, alpha_ / size_, bias_, scale_data);
n_threads, Xdata, C, H, W, size_, alpha_ / size_, bias_, scale_data);
C10_CUDA_KERNEL_LAUNCH_CHECK();
n_threads = X.numel();
@ -237,7 +237,7 @@ bool LRNOp<float, CUDAContext>::RunOnDeviceWithOrderNHWC() {
int n_threads = X.numel();
LRNFillScaleNHWC<float><<<CAFFE_GET_BLOCKS(n_threads), CAFFE_CUDA_NUM_THREADS,
0, context_.cuda_stream()>>>(
n_threads, Xdata, N, H, W, C, size_, alpha_ / size_, bias_, scale_data);
n_threads, Xdata, H, W, C, size_, alpha_ / size_, bias_, scale_data);
C10_CUDA_KERNEL_LAUNCH_CHECK();
LRNComputeOutput<float><<<CAFFE_GET_BLOCKS(n_threads), CAFFE_CUDA_NUM_THREADS,
@ -271,20 +271,20 @@ bool LRNGradientOp<float, CUDAContext>::RunOnDeviceWithOrderNCHW() {
scale_ = &local_scale_tensor_;
}
scale_->ResizeLike(X);
float* scale_data = scale_->template mutable_data<float>();
int n_threads = N * H * W;
float *const scale_data = scale_->template mutable_data<float>();
const int n_threads = N * H * W;
LRNFillScaleNCHW<float><<<CAFFE_GET_BLOCKS(n_threads), CAFFE_CUDA_NUM_THREADS,
0, context_.cuda_stream()>>>(
n_threads, Xdata, N, C, H, W, size_, alpha_ / size_, bias_, scale_data);
n_threads, Xdata, C, H, W, size_, alpha_ / size_, bias_, scale_data);
C10_CUDA_KERNEL_LAUNCH_CHECK();
const float* dYdata = dY.data<float>();
float* dXdata = dX->template mutable_data<float>();
const float *const dYdata = dY.data<float>();
float *const dXdata = dX->template mutable_data<float>();
LRNComputeDiffNCHW<float><<<CAFFE_GET_BLOCKS(n_threads),
CAFFE_CUDA_NUM_THREADS,
0, context_.cuda_stream()>>>(
n_threads, Xdata, Ydata, scale_data, dYdata, N, C, H, W, size_, -beta_,
n_threads, Xdata, Ydata, scale_data, dYdata, C, H, W, size_, -beta_,
2.f * alpha_ * beta_ / size_, dXdata);
C10_CUDA_KERNEL_LAUNCH_CHECK();
@ -317,7 +317,7 @@ bool LRNGradientOp<float, CUDAContext>::RunOnDeviceWithOrderNHWC() {
int n_threads = X.numel();
LRNFillScaleNHWC<float><<<CAFFE_GET_BLOCKS(n_threads), CAFFE_CUDA_NUM_THREADS,
0, context_.cuda_stream()>>>(
n_threads, Xdata, N, H, W, C, size_, alpha_ / size_, bias_, scale_data);
n_threads, Xdata, H, W, C, size_, alpha_ / size_, bias_, scale_data);
C10_CUDA_KERNEL_LAUNCH_CHECK();
LRNComputeDiffNHWC<float>
@ -330,7 +330,6 @@ bool LRNGradientOp<float, CUDAContext>::RunOnDeviceWithOrderNHWC() {
Y.data<float>(),
scale_data,
dY.data<float>(),
X.dim32(0),
X.dim32(1),
X.dim32(2),
X.dim32(3),
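
The `num` argument to these LRN kernels was never read -- the batch index falls out of the flat thread index -- and the index arithmetic is write-once, so it becomes const. A stripped-down sketch of that indexing pattern (hypothetical kernel, with a plain grid-stride loop standing in for CUDA_1D_KERNEL_LOOP):

template <typename T>
__global__ void copy_channel0_nchw(const int nthreads, const T* const in,
                                   const int channels, const int height,
                                   const int width, T* const out) {
  // nthreads == N * H * W: one thread per (n, h, w) location.
  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
       index += blockDim.x * gridDim.x) {
    const int w = index % width;
    const int h = (index / width) % height;
    const int n = index / width / height;   // N itself is never needed
    const int offset = (n * channels * height + h) * width + w;
    out[offset] = in[offset];               // placeholder body
  }
}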

View File

@ -19,8 +19,7 @@ using c10::cuda::compat::pow;
template <typename T>
__global__ void LpPoolForwardNCHW(
const int nthreads,
const T* bottom_data,
const int num,
const T *const bottom_data,
const int channels,
const int height,
const int width,
@ -32,7 +31,7 @@ __global__ void LpPoolForwardNCHW(
const int stride_w,
const int pad_t,
const int pad_l,
T* top_data,
T *const top_data,
const T p) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
int n = index;
@ -64,7 +63,6 @@ template <typename T>
__global__ void LpPoolForwardNHWC(
const int nthreads,
const T* bottom_data,
const int num,
const int height,
const int width,
const int channels,
@ -107,7 +105,6 @@ __global__ void LpPoolBackwardNCHW(
const T* const top_diff,
const T* const top_data,
const T* const bottom_data,
const int num,
const int channels,
const int height,
const int width,
@ -143,8 +140,6 @@ __global__ void LpPoolBackwardNCHW(
// figure out the pooling size
int hstart = ph * stride_h - pad_t;
int wstart = pw * stride_w - pad_l;
int hend = min(hstart + kernel_h, height);
int wend = min(wstart + kernel_w, width);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
gradient += top_diff_slice[ph * pooled_width + pw] *
@ -162,7 +157,6 @@ __global__ void LpPoolBackwardNHWC(
const T* const top_diff,
const T* const top_data,
const T* const bottom_data,
const int num,
const int height,
const int width,
const int channels,
@ -195,12 +189,8 @@ __global__ void LpPoolBackwardNHWC(
for (int ph = phstart; ph < phend; ++ph) {
for (int pw = pwstart; pw < pwend; ++pw) {
// figure out the pooling size
int hstart = ph * stride_h - pad_t;
int wstart = pw * stride_w - pad_l;
int hend = min(hstart + kernel_h, height);
int wend = min(wstart + kernel_w, width);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
const int hstart = max(ph * stride_h - pad_t, 0);
const int wstart = max(pw * stride_w - pad_l, 0);
gradient += top_diff_slice[(ph * pooled_width + pw) * channels] *
bottom_data[index] * pow(abs(bottom_data[index]), p - 2) /
pow(top_data_slice[(ph * pooled_width + pw) * channels], p - 1);
@ -225,7 +215,6 @@ bool PoolOp<float, CUDAContext, LpPoolFunctor>::RunOnDeviceWithOrderNCHW() {
context_.cuda_stream()>>>(
output_size,
X.data<float>(),
X.dim32(0),
X.dim32(1),
X.dim32(2),
X.dim32(3),
@ -257,7 +246,6 @@ bool PoolOp<float, CUDAContext, LpPoolFunctor>::RunOnDeviceWithOrderNHWC() {
context_.cuda_stream()>>>(
output_size,
X.data<float>(),
X.dim32(0),
X.dim32(1),
X.dim32(2),
X.dim32(3),
@ -295,7 +283,6 @@ bool PoolGradientOp<float, CUDAContext, LpPoolFunctor>::
dY.data<float>(),
Y.data<float>(),
X.data<float>(),
X.dim32(0),
X.dim32(1),
X.dim32(2),
X.dim32(3),
@ -333,7 +320,6 @@ bool PoolGradientOp<float, CUDAContext, LpPoolFunctor>::
dY.data<float>(),
Y.data<float>(),
X.data<float>(),
X.dim32(0),
X.dim32(1),
X.dim32(2),
X.dim32(3),
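
In the NHWC backward kernel above, hend and wend were computed but never used, and hstart/wstart were clamped only after initialization; folding the clamp into the initializer deletes the dead code and lets both be const. A one-line sketch of the fold (hypothetical helper):

// Before:
//   int hstart = ph * stride_h - pad_t;
//   int hend = min(hstart + kernel_h, height);   // never read -> deleted
//   hstart = max(hstart, 0);
// After:
__device__ int pooling_window_start(const int p, const int stride, const int pad) {
  return max(p * stride - pad, 0);
}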

View File

@ -124,16 +124,15 @@ __global__ void PReluWGradientKernelNCHW(
template <typename T>
__global__ void PReluWGradientKernelNHWC(
const int C,
const int N,
const int num_items,
const T* Xdata,
const T* dYdata,
T* dW) {
int c = blockIdx.x;
const auto c = blockIdx.x;
T wsum = 0.0;
int items_per_channel = num_items / C;
const auto items_per_channel = num_items / C;
for (int i = threadIdx.x; i < items_per_channel; i += blockDim.x) {
int ii = i * C + c;
const auto ii = i * C + c;
wsum += (Xdata[ii] <= 0) * dYdata[ii] * Xdata[ii];
}
@ -270,7 +269,7 @@ bool PReluGradientOp<float, CUDAContext>::RunOnDevice() {
C,
CAFFE_CUDA_NUM_THREADS,
0,
context_.cuda_stream()>>>(C, N, X.numel(), Xdata, dYdata, dWdata);
context_.cuda_stream()>>>(C, X.numel(), Xdata, dYdata, dWdata);
C10_CUDA_KERNEL_LAUNCH_CHECK();
PReluGradientKernelNHWC<<<
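
PReluWGradientKernelNHWC now takes only the flat item count, and values derived from blockIdx.x/threadIdx.x use const auto, which also picks up the unsigned type of the built-in index variables. A hedged sketch of the same launch shape (hypothetical kernel, float only, with atomicAdd standing in for the real block-level reduction):

__global__ void per_channel_sum(const int C, const int num_items,
                                const float* const in, float* const out) {
  const auto c = blockIdx.x;                    // one block per channel
  const auto items_per_channel = num_items / C;
  float partial = 0.f;
  for (int i = threadIdx.x; i < items_per_channel; i += blockDim.x) {
    const auto ii = i * C + c;                  // NHWC-style interleaved layout
    partial += in[ii];
  }
  atomicAdd(&out[c], partial);                  // out must be zero-initialized by the caller
}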

View File

@ -80,8 +80,6 @@ __global__ void ROIPoolBackward(
const int nthreads,
const T* top_diff,
const int* argmax_data,
const int num_rois,
const T spatial_scale,
const int channels,
const int height,
const int width,
@ -91,10 +89,10 @@ __global__ void ROIPoolBackward(
const T* bottom_rois) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
// (n, c, ph, pw) is an element in the pooled output
int pw = index % pooled_width;
int ph = (index / pooled_width) % pooled_height;
int c = (index / pooled_width / pooled_height) % channels;
int n = index / pooled_width / pooled_height / channels;
const int pw = index % pooled_width;
const int ph = (index / pooled_width) % pooled_height;
const int c = (index / pooled_width / pooled_height) % channels;
const int n = index / pooled_width / pooled_height / channels;
const T* offset_bottom_rois = bottom_rois + n * 5;
int roi_batch_ind = offset_bottom_rois[0];
@ -184,8 +182,6 @@ C10_EXPORT bool RoIPoolGradientOp<float, CUDAContext>::RunOnDevice() {
dY.numel(),
dY.data<float>(),
A.data<int>(),
R.dim32(0),
spatial_scale_,
X.dim32(1),
X.dim32(2),
X.dim32(3),
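
Removing num_rois and spatial_scale from ROIPoolBackward only works because the launch above sheds R.dim32(0) and spatial_scale_ in the same change; the signature and every call site have to move together or the build breaks. A hedged sketch of that pairing, with hypothetical names:

__global__ void backward_sketch(const int nthreads, const float* const top_diff,
                                const int channels, float* const bottom_diff) {
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < nthreads;
       i += blockDim.x * gridDim.x) {
    bottom_diff[i] = top_diff[i] / channels;    // placeholder body
  }
}

void launch_backward_sketch(const int nthreads, const float* top_diff,
                            const int channels, float* bottom_diff,
                            cudaStream_t stream) {
  const int threads = 256;
  const int blocks = (nthreads + threads - 1) / threads;
  // Arguments dropped from the kernel must also be dropped here;
  // the mismatch is caught at compile time.
  backward_sketch<<<blocks, threads, 0, stream>>>(nthreads, top_diff,
                                                  channels, bottom_diff);
}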

View File

@ -49,8 +49,7 @@ __global__ void length_sum_kernel(
T* __restrict__ out,
const int* __restrict__ prefix_sum_length_data,
int N,
int post,
int len_length) {
int post) {
// len_length blocks
int group = blockIdx.x;
@ -177,12 +176,11 @@ C10_LAUNCH_BOUNDS_2(1024, SEGREDUCE_MINBLOCKS)
#endif
__global__ void length_weighted_sum_gradient_kernel(
const T* __restrict__ grad_in,
const T* __restrict__ weights_in,
const T *const __restrict__ weights_in,
T* __restrict__ grad_out,
const int* __restrict__ prefix_sum_length_data,
int N,
int post,
int len_length) {
const int *const __restrict__ prefix_sum_length_data,
const int N,
const int post) {
// len_length blocks
int group = blockIdx.x;
@ -213,21 +211,20 @@ template <typename T, typename IndexType, int NumThreads>
C10_LAUNCH_BOUNDS_2(1024, SEGREDUCE_MINBLOCKS)
#endif
__global__ void length_weighted_sum_with_main_input_gradient_kernel(
const T* __restrict__ grad_in,
const T* __restrict__ weights_in,
const T* __restrict__ data_in,
const IndexType* __restrict__ indices,
T* __restrict__ data_grad_out,
T* __restrict__ weights_grad_out,
const int* __restrict__ prefix_sum_length_data,
int N,
int post,
int len_length) {
const T *const __restrict__ grad_in,
const T *const __restrict__ weights_in,
const T *const __restrict__ data_in,
const IndexType *const __restrict__ indices,
T *const __restrict__ data_grad_out,
T *const __restrict__ weights_grad_out,
const int *const __restrict__ prefix_sum_length_data,
const int N,
const int post) {
// len_length blocks
int group = blockIdx.x;
const int group = blockIdx.x;
int start = group == 0 ? 0 : prefix_sum_length_data[group - 1];
int end = prefix_sum_length_data[group];
const int start = group == 0 ? 0 : prefix_sum_length_data[group - 1];
const int end = prefix_sum_length_data[group];
CUDA_KERNEL_ASSERT(start <= N);
CUDA_KERNEL_ASSERT(end <= N);
@ -257,19 +254,17 @@ C10_LAUNCH_BOUNDS_2(1024, SEGREDUCE_MINBLOCKS)
#endif
__global__ void sparse_length_max_kernel(
const T* __restrict__ in,
T* __restrict__ out,
const int* __restrict__ prefix_sum_length_data,
const IndexType* __restrict__ indices,
int N,
int post,
int len_length,
int len_indices,
T *const __restrict__ out,
const int *const __restrict__ prefix_sum_length_data,
const IndexType *const __restrict__ indices,
const int post,
const int len_indices,
const T numeric_min) {
// len_length blocks
int group = blockIdx.x;
const int group = blockIdx.x;
int start = group == 0 ? 0 : prefix_sum_length_data[group - 1];
int end = prefix_sum_length_data[group];
const int start = group == 0 ? 0 : prefix_sum_length_data[group - 1];
const int end = prefix_sum_length_data[group];
CUDA_KERNEL_ASSERT(start <= len_indices);
CUDA_KERNEL_ASSERT(end <= len_indices);
@ -487,12 +482,12 @@ class CUDASparseLengthsSumOp : public Operator<CUDAContext> {
if (post <= maxThreads) {
length_sum_kernel<T, true, false>
<<<len_length, post, 0, context_.cuda_stream()>>>(
in_data, out_data, prefix_sum_length_data, N, post, len_length);
in_data, out_data, prefix_sum_length_data, N, post);
C10_CUDA_KERNEL_LAUNCH_CHECK();
} else {
length_sum_kernel<T, true, false>
<<<len_length, maxThreads, 0, context_.cuda_stream()>>>(
in_data, out_data, prefix_sum_length_data, N, post, len_length);
in_data, out_data, prefix_sum_length_data, N, post);
C10_CUDA_KERNEL_LAUNCH_CHECK();
}
}
@ -625,13 +620,13 @@ class CUDASparseLengthsMeanOp : public Operator<CUDAContext> {
// calling cuda kernel with ExactBlock = true, Average = true
length_sum_kernel<T, true, true>
<<<len_length, post, 0, context_.cuda_stream()>>>(
in_data, out_data, prefix_sum_length_data, N, post, len_length);
in_data, out_data, prefix_sum_length_data, N, post);
C10_CUDA_KERNEL_LAUNCH_CHECK();
} else {
// calling cuda kernel with ExactBlock = true, Average = true
length_sum_kernel<T, true, true>
<<<len_length, maxThreads, 0, context_.cuda_stream()>>>(
in_data, out_data, prefix_sum_length_data, N, post, len_length);
in_data, out_data, prefix_sum_length_data, N, post);
C10_CUDA_KERNEL_LAUNCH_CHECK();
}
}
@ -719,9 +714,9 @@ class CUDASparseLengthsMaxOp : public Operator<CUDAContext> {
T numeric_min = std::numeric_limits<T>::min();
if (SparseFused) {
if (post <= maxThreads) {
int multiple = std::min(maxThreads / post, SEGREDUCE_MINBLOCKS);
dim3 block(post, multiple);
size_t smem = sizeof(T) * post * multiple;
const int multiple = std::min(maxThreads / post, SEGREDUCE_MINBLOCKS);
const dim3 block(post, multiple);
const size_t smem = sizeof(T) * post * multiple;
sparse_length_max_kernel<T, IndexType, true>
<<<len_length, block, smem, context_.cuda_stream()>>>(
@ -729,9 +724,7 @@ class CUDASparseLengthsMaxOp : public Operator<CUDAContext> {
out_data,
prefix_sum_length_data,
indices,
N,
post,
len_length,
dataToReduceSize,
numeric_min);
C10_CUDA_KERNEL_LAUNCH_CHECK();
@ -742,9 +735,7 @@ class CUDASparseLengthsMaxOp : public Operator<CUDAContext> {
out_data,
prefix_sum_length_data,
indices,
N,
post,
len_length,
dataToReduceSize,
numeric_min);
C10_CUDA_KERNEL_LAUNCH_CHECK();
@ -1528,8 +1519,7 @@ class CUDASparseLengthsWeightedSumGradientWithIndicesOp
out_data,
prefix_sum_length_data,
N,
post,
len_length);
post);
C10_CUDA_KERNEL_LAUNCH_CHECK();
} else {
length_weighted_sum_gradient_kernel<T, false>
@ -1539,8 +1529,7 @@ class CUDASparseLengthsWeightedSumGradientWithIndicesOp
out_data,
prefix_sum_length_data,
N,
post,
len_length);
post);
C10_CUDA_KERNEL_LAUNCH_CHECK();
}
@ -1768,8 +1757,7 @@ class CUDASparseLengthsIndicesInGradientWeightedSumWithMainInputGradientOp
out_weight_grads,
prefix_sum_length_data,
N,
post,
len_length);
post);
C10_CUDA_KERNEL_LAUNCH_CHECK();
} else if (post > 64) {
length_weighted_sum_with_main_input_gradient_kernel<T, IndexType, 128>
@ -1782,8 +1770,7 @@ class CUDASparseLengthsIndicesInGradientWeightedSumWithMainInputGradientOp
out_weight_grads,
prefix_sum_length_data,
N,
post,
len_length);
post);
C10_CUDA_KERNEL_LAUNCH_CHECK();
} else if (post > 32) {
length_weighted_sum_with_main_input_gradient_kernel<T, IndexType, 64>
@ -1796,8 +1783,7 @@ class CUDASparseLengthsIndicesInGradientWeightedSumWithMainInputGradientOp
out_weight_grads,
prefix_sum_length_data,
N,
post,
len_length);
post);
C10_CUDA_KERNEL_LAUNCH_CHECK();
} else {
length_weighted_sum_with_main_input_gradient_kernel<T, IndexType, 32>
@ -1810,8 +1796,7 @@ class CUDASparseLengthsIndicesInGradientWeightedSumWithMainInputGradientOp
out_weight_grads,
prefix_sum_length_data,
N,
post,
len_length);
post);
C10_CUDA_KERNEL_LAUNCH_CHECK();
}
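
In these segment-reduction kernels, len_length duplicated information the launch already carries -- they run with one block per segment, so blockIdx.x spans [0, len_length) -- and read-only arrays now use the `const T *const __restrict__` form. A hedged sketch of that launch contract (hypothetical kernel):

template <typename T>
__global__ void segment_sum_sketch(const T *const __restrict__ in,
                                   T *const __restrict__ out,
                                   const int *const __restrict__ prefix_sum_length_data,
                                   const int N,
                                   const int post) {
  const int group = blockIdx.x;   // segment id; gridDim.x == number of segments
  const int start = group == 0 ? 0 : prefix_sum_length_data[group - 1];
  const int end = prefix_sum_length_data[group];
  for (int i = threadIdx.x; i < post; i += blockDim.x) {
    T sum = 0;
    for (int line = start; line < end && line < N; ++line) {
      sum += in[line * post + i];
    }
    out[group * post + i] = sum;
  }
}

The call sites above still launch with <<<len_length, ...>>>, so nothing is lost by dropping the argument.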

View File

@ -5,7 +5,7 @@
namespace caffe2 {
namespace {
__global__ void SliceCopyKernel(
char* src_offset_bytes,
const char* src_offset_bytes,
int src_block_size_bytes,
char* dst_offset_bytes,
int dst_block_size_bytes,
@ -165,8 +165,8 @@ bool SliceImplGpu(
size_t src_block_size_bytes = itemsize * src_block_size;
size_t dst_block_size_bytes = itemsize * dst_block_size;
char* src_offset_bytes = src_bytes + itemsize * src_offset;
char* dst_offset_bytes = dst_bytes;
const char *const src_offset_bytes = src_bytes + itemsize * src_offset;
char *const dst_offset_bytes = dst_bytes;
SliceCopyKernel<<<
std::min(num_blocks, CAFFE_MAXIMUM_NUM_BLOCKS),
@ -182,25 +182,22 @@ bool SliceImplGpu(
num_blocks);
C10_CUDA_KERNEL_LAUNCH_CHECK();
} else {
char* src_bytes = (char*)go->raw_data();
char* dst_bytes = (char*)gdata->raw_mutable_data(go->meta());
const char *const src_bytes = (char*)go->raw_data();
char *const dst_bytes = (char*)gdata->raw_mutable_data(go->meta());
size_t src_nbytes = go->nbytes();
size_t dst_nbytes = gdata->nbytes();
size_t src_block_size = unit * (ends_idx[dim] - starts_idx[dim]);
size_t dst_block_size = unit * data.size(dim);
size_t dst_offset = unit * starts_idx[dim];
const size_t src_block_size = unit * (ends_idx[dim] - starts_idx[dim]);
const size_t dst_block_size = unit * data.size(dim);
const size_t dst_offset = unit * starts_idx[dim];
if (num_blocks == 0 || dst_block_size == 0) {
return true;
}
size_t src_block_size_bytes = itemsize * src_block_size;
size_t dst_block_size_bytes = itemsize * dst_block_size;
const size_t src_block_size_bytes = itemsize * src_block_size;
const size_t dst_block_size_bytes = itemsize * dst_block_size;
char* src_offset_bytes = src_bytes;
char* dst_offset_bytes = dst_bytes + itemsize * dst_offset;
const char *const src_offset_bytes = src_bytes;
char *const dst_offset_bytes = dst_bytes + itemsize * dst_offset;
// Zero out gradient blob before copy since we copy in fewer items than
// there is space for
math::Set<float, CUDAContext>(
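
SliceCopyKernel only ever reads through its source pointer, so the parameter and the host-side pointers feeding it gain const, and the compiler can then reject any accidental write through them. A minimal sketch of that const-correct direction (hypothetical byte-copy kernel):

__global__ void slice_copy_sketch(const char *const src, char *const dst,
                                  const size_t nbytes) {
  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < nbytes;
       i += blockDim.x * gridDim.x) {
    dst[i] = src[i];   // byte-wise copy; the real kernel copies strided blocks
  }
}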

View File

@ -231,7 +231,6 @@ __global__ void ComputeXGradientNCHWCUDAKernel(
template <typename T>
__global__ void ComputeXGradientNHWCCUDAKernel(
const int C,
const int HxW,
const T* dY,
const T* X,
const T* alpha,
@ -462,7 +461,7 @@ void SpatialBNGradientOp<CUDAContext>::ComputeXGradient(
<<<dim3(N * HxW, M),
CAFFE_CUDA_NUM_THREADS,
0,
context_.cuda_stream()>>>(C, HxW, dY, X, alpha, beta, gamma, dX);
context_.cuda_stream()>>>(C, dY, X, alpha, beta, gamma, dX);
C10_CUDA_KERNEL_LAUNCH_CHECK();
}
}
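
ComputeXGradientNHWCCUDAKernel never reads HxW inside the kernel body; it only matters on the host when sizing the grid (dim3(N * HxW, M)), so it drops out of the signature and the launch arguments while staying in the launch configuration. A hedged sketch of that split, with hypothetical names and C assumed to fit in a single block for brevity:

__global__ void scale_channels_nhwc_sketch(const int C, const float* const dY,
                                           const float* const gamma,
                                           float* const dX) {
  // One block per (n, h, w) position; threads cover the channel dimension.
  if (threadIdx.x < C) {
    const int index = blockIdx.x * C + threadIdx.x;
    dX[index] = dY[index] * gamma[threadIdx.x];
  }
}

void launch_scale_channels_nhwc_sketch(const int N, const int C, const int HxW,
                                       const float* dY, const float* gamma,
                                       float* dX, cudaStream_t stream) {
  // HxW shapes the launch but is not forwarded to the kernel.
  scale_channels_nhwc_sketch<<<N * HxW, C, 0, stream>>>(C, dY, gamma, dX);
}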