Drop unused variables; make things const; use some auto (#71107)

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/71107

Test Plan: Sandcastle

Reviewed By: ngimel

Differential Revision: D33490773

fbshipit-source-id: 0d259db9c58c9b33aecc560075f6dcfa78883467
Author: Richard Barnes (2022-01-11 08:54:22 -08:00)
Committed by: Facebook GitHub Bot
Parent: 3c2ae2b47c
Commit: cf61738097
8 changed files with 97 additions and 136 deletions
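
The hunks below repeat the same three mechanical changes across eight CUDA/C++ files: parameters and locals that are never read are deleted, values that are never reassigned become const, and auto is used where the type is obvious from the initializer. As a condensed, hypothetical sketch of all three (illustrative only, not code from this commit):

#include <cstdint>
#include <vector>

// Before: the function also took an unused `scale` parameter, `limit` was a
// mutable int, and the loop spelled out the element type.
int64_t sum_small(const std::vector<int64_t>& values) {  // unused parameter dropped
  const int64_t limit = 100;        // const: never reassigned
  int64_t total = 0;                // stays mutable: it accumulates
  for (const auto v : values) {     // auto: element type comes from the container
    if (v < limit) {
      total += v;
    }
  }
  return total;
}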

View File

@ -133,9 +133,9 @@ TORCH_API void record_kernel_function_dtype(std::string name);
const auto& SCALAR_TYPE C10_UNUSED_DISPATCH_CUDA_WORKAROUND = enum_type; \
const auto& UNDERLYING_TYPE C10_UNUSED_DISPATCH_CUDA_WORKAROUND = \
toUnderlying(enum_type); \
int bit_width = bitwidth; \
int64_t quant_min = qmin; \
int64_t quant_max = qmax; \
C10_UNUSED int bit_width = bitwidth; \
C10_UNUSED int64_t quant_min = qmin; \
C10_UNUSED int64_t quant_max = qmax; \
(void)bit_width; /* Suppress unused variable warning */ \
(void)quant_min; /* Suppress unused variable warning */ \
(void)quant_max; /* Suppress unused variable warning */ \
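
The macro body above both tags the new locals with C10_UNUSED and keeps the (void) casts. A minimal sketch of why both exist, using a stand-in MY_UNUSED macro (the real C10_UNUSED lives in c10/macros/Macros.h and its exact expansion is compiler-dependent, so treat this as illustrative):

#include <cstdint>

#if defined(__GNUC__) || defined(__clang__)
#define MY_UNUSED __attribute__((__unused__))
#else
#define MY_UNUSED
#endif

void quantization_bounds_example() {
  MY_UNUSED const int bit_width = 8;        // attribute silences -Wunused-variable where supported
  MY_UNUSED const int64_t quant_min = 0;
  MY_UNUSED const int64_t quant_max = 255;
  (void)bit_width;                          // the (void) casts stay as a fallback for
  (void)quant_min;                          // toolchains that do not honor the attribute
  (void)quant_max;
}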

View File

@ -6,21 +6,21 @@ namespace caffe2 {
namespace {
template <typename T>
__global__ void LRNFillScaleNCHW(const int nthreads, const T* in,
const int num, const int channels, const int height,
const int channels, const int height,
const int width, const int size, const T alpha_over_size,
const T bias, T* scale) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
// find out the local offset
int w = index % width;
int h = (index / width) % height;
int n = index / width / height;
int offset = (n * channels * height + h) * width + w;
int step = height * width;
const int w = index % width;
const int h = (index / width) % height;
const int n = index / width / height;
const int offset = (n * channels * height + h) * width + w;
const int step = height * width;
in += offset;
scale += offset;
int head = 0;
int pre_pad = (size - 1) / 2;
int post_pad = size - pre_pad - 1;
const int pre_pad = (size - 1) / 2;
const int post_pad = size - pre_pad - 1;
T accum_scale = 0;
// fill the scale at [n, :, h, w]
// accumulate values
@ -54,16 +54,16 @@ __global__ void LRNFillScaleNCHW(const int nthreads, const T* in,
}
template <typename T>
__global__ void LRNFillScaleNHWC(const int nthreads, const T* in,
const int num, const int height, const int width,
__global__ void LRNFillScaleNHWC(const int nthreads, const T *const in,
const int height, const int width,
const int channels, const int size, const T alpha_over_size,
const T bias, T* scale) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
int c = index % channels;
int pre_pad = (size - 1) / 2;
const int c = index % channels;
const int pre_pad = (size - 1) / 2;
scale[index] = 0;
for (int i = 0; i < size; ++i) {
int raw_idx = c + i - pre_pad;
const int raw_idx = c + i - pre_pad;
if (raw_idx >= 0 && raw_idx < channels) {
scale[index] += in[index + i - pre_pad] * in[index + i - pre_pad];
}
@ -85,17 +85,17 @@ __global__ void LRNComputeOutput(const int nthreads, const T* in,
template <typename T>
__global__ void LRNComputeDiffNCHW(const int nthreads, const T* bottom_data,
const T* top_data, const T* scale, const T* top_diff,
const int num, const int channels, const int height,
const int channels, const int height,
const int width, const int size, const T negative_beta,
const T cache_ratio,
T* bottom_diff) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
// find out the local offset
int w = index % width;
int h = (index / width) % height;
int n = index / width / height;
int offset = (n * channels * height + h) * width + w;
int step = height * width;
const int w = index % width;
const int h = (index / width) % height;
const int n = index / width / height;
const int offset = (n * channels * height + h) * width + w;
const int step = height * width;
bottom_data += offset;
top_data += offset;
scale += offset;
@ -155,13 +155,13 @@ __global__ void LRNComputeDiffNCHW(const int nthreads, const T* bottom_data,
template <typename T>
__global__ void LRNComputeDiffNHWC(const int nthreads, const T* bottom_data,
const T* top_data, const T* scale, const T* top_diff,
const int num, const int height, const int width, const int channels,
const int height, const int width, const int channels,
const int size, const T negative_beta, const T cache_ratio,
T* bottom_diff) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
// find out the local channel offset
int c = index % channels;
int pre_pad = size / 2;
const int c = index % channels;
const int pre_pad = size / 2;
T accum_ratio = 0;
for (int i = -pre_pad; i < size - pre_pad; ++i) {
if (c + i >= 0 && c + i < channels) {
@ -200,7 +200,7 @@ bool LRNOp<float, CUDAContext>::RunOnDeviceWithOrderNCHW() {
int n_threads = N * H * W;
LRNFillScaleNCHW<float><<<CAFFE_GET_BLOCKS(n_threads), CAFFE_CUDA_NUM_THREADS,
0, context_.cuda_stream()>>>(
n_threads, Xdata, N, C, H, W, size_, alpha_ / size_, bias_, scale_data);
n_threads, Xdata, C, H, W, size_, alpha_ / size_, bias_, scale_data);
C10_CUDA_KERNEL_LAUNCH_CHECK();
n_threads = X.numel();
@ -237,7 +237,7 @@ bool LRNOp<float, CUDAContext>::RunOnDeviceWithOrderNHWC() {
int n_threads = X.numel();
LRNFillScaleNHWC<float><<<CAFFE_GET_BLOCKS(n_threads), CAFFE_CUDA_NUM_THREADS,
0, context_.cuda_stream()>>>(
n_threads, Xdata, N, H, W, C, size_, alpha_ / size_, bias_, scale_data);
n_threads, Xdata, H, W, C, size_, alpha_ / size_, bias_, scale_data);
C10_CUDA_KERNEL_LAUNCH_CHECK();
LRNComputeOutput<float><<<CAFFE_GET_BLOCKS(n_threads), CAFFE_CUDA_NUM_THREADS,
@ -271,20 +271,20 @@ bool LRNGradientOp<float, CUDAContext>::RunOnDeviceWithOrderNCHW() {
scale_ = &local_scale_tensor_;
}
scale_->ResizeLike(X);
float* scale_data = scale_->template mutable_data<float>();
int n_threads = N * H * W;
float *const scale_data = scale_->template mutable_data<float>();
const int n_threads = N * H * W;
LRNFillScaleNCHW<float><<<CAFFE_GET_BLOCKS(n_threads), CAFFE_CUDA_NUM_THREADS,
0, context_.cuda_stream()>>>(
n_threads, Xdata, N, C, H, W, size_, alpha_ / size_, bias_, scale_data);
n_threads, Xdata, C, H, W, size_, alpha_ / size_, bias_, scale_data);
C10_CUDA_KERNEL_LAUNCH_CHECK();
const float* dYdata = dY.data<float>();
float* dXdata = dX->template mutable_data<float>();
const float *const dYdata = dY.data<float>();
float *const dXdata = dX->template mutable_data<float>();
LRNComputeDiffNCHW<float><<<CAFFE_GET_BLOCKS(n_threads),
CAFFE_CUDA_NUM_THREADS,
0, context_.cuda_stream()>>>(
n_threads, Xdata, Ydata, scale_data, dYdata, N, C, H, W, size_, -beta_,
n_threads, Xdata, Ydata, scale_data, dYdata, C, H, W, size_, -beta_,
2.f * alpha_ * beta_ / size_, dXdata);
C10_CUDA_KERNEL_LAUNCH_CHECK();
@ -317,7 +317,7 @@ bool LRNGradientOp<float, CUDAContext>::RunOnDeviceWithOrderNHWC() {
int n_threads = X.numel();
LRNFillScaleNHWC<float><<<CAFFE_GET_BLOCKS(n_threads), CAFFE_CUDA_NUM_THREADS,
0, context_.cuda_stream()>>>(
n_threads, Xdata, N, H, W, C, size_, alpha_ / size_, bias_, scale_data);
n_threads, Xdata, H, W, C, size_, alpha_ / size_, bias_, scale_data);
C10_CUDA_KERNEL_LAUNCH_CHECK();
LRNComputeDiffNHWC<float>
@ -330,7 +330,6 @@ bool LRNGradientOp<float, CUDAContext>::RunOnDeviceWithOrderNHWC() {
Y.data<float>(),
scale_data,
dY.data<float>(),
X.dim32(0),
X.dim32(1),
X.dim32(2),
X.dim32(3),
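
The `num` argument to these LRN kernels was never read -- the batch index falls out of the flat thread index -- and the index arithmetic is write-once, so it becomes const. A stripped-down sketch of that indexing pattern (hypothetical kernel, with a plain grid-stride loop standing in for CUDA_1D_KERNEL_LOOP):

template <typename T>
__global__ void copy_channel0_nchw(const int nthreads, const T* const in,
                                   const int channels, const int height,
                                   const int width, T* const out) {
  // nthreads == N * H * W: one thread per (n, h, w) location.
  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
       index += blockDim.x * gridDim.x) {
    const int w = index % width;
    const int h = (index / width) % height;
    const int n = index / width / height;   // N itself is never needed
    const int offset = (n * channels * height + h) * width + w;
    out[offset] = in[offset];               // placeholder body
  }
}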

View File

@ -19,8 +19,7 @@ using c10::cuda::compat::pow;
template <typename T>
__global__ void LpPoolForwardNCHW(
const int nthreads,
const T* bottom_data,
const int num,
const T *const bottom_data,
const int channels,
const int height,
const int width,
@ -32,7 +31,7 @@ __global__ void LpPoolForwardNCHW(
const int stride_w,
const int pad_t,
const int pad_l,
T* top_data,
T *const top_data,
const T p) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
int n = index;
@ -64,7 +63,6 @@ template <typename T>
__global__ void LpPoolForwardNHWC(
const int nthreads,
const T* bottom_data,
const int num,
const int height,
const int width,
const int channels,
@ -107,7 +105,6 @@ __global__ void LpPoolBackwardNCHW(
const T* const top_diff,
const T* const top_data,
const T* const bottom_data,
const int num,
const int channels,
const int height,
const int width,
@ -143,8 +140,6 @@ __global__ void LpPoolBackwardNCHW(
// figure out the pooling size
int hstart = ph * stride_h - pad_t;
int wstart = pw * stride_w - pad_l;
int hend = min(hstart + kernel_h, height);
int wend = min(wstart + kernel_w, width);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
gradient += top_diff_slice[ph * pooled_width + pw] *
@ -162,7 +157,6 @@ __global__ void LpPoolBackwardNHWC(
const T* const top_diff,
const T* const top_data,
const T* const bottom_data,
const int num,
const int height,
const int width,
const int channels,
@ -195,12 +189,8 @@ __global__ void LpPoolBackwardNHWC(
for (int ph = phstart; ph < phend; ++ph) {
for (int pw = pwstart; pw < pwend; ++pw) {
// figure out the pooling size
int hstart = ph * stride_h - pad_t;
int wstart = pw * stride_w - pad_l;
int hend = min(hstart + kernel_h, height);
int wend = min(wstart + kernel_w, width);
hstart = max(hstart, 0);
wstart = max(wstart, 0);
const int hstart = max(ph * stride_h - pad_t, 0);
const int wstart = max(pw * stride_w - pad_l, 0);
gradient += top_diff_slice[(ph * pooled_width + pw) * channels] *
bottom_data[index] * pow(abs(bottom_data[index]), p - 2) /
pow(top_data_slice[(ph * pooled_width + pw) * channels], p - 1);
@ -225,7 +215,6 @@ bool PoolOp<float, CUDAContext, LpPoolFunctor>::RunOnDeviceWithOrderNCHW() {
context_.cuda_stream()>>>(
output_size,
X.data<float>(),
X.dim32(0),
X.dim32(1),
X.dim32(2),
X.dim32(3),
@ -257,7 +246,6 @@ bool PoolOp<float, CUDAContext, LpPoolFunctor>::RunOnDeviceWithOrderNHWC() {
context_.cuda_stream()>>>(
output_size,
X.data<float>(),
X.dim32(0),
X.dim32(1),
X.dim32(2),
X.dim32(3),
@ -295,7 +283,6 @@ bool PoolGradientOp<float, CUDAContext, LpPoolFunctor>::
dY.data<float>(),
Y.data<float>(),
X.data<float>(),
X.dim32(0),
X.dim32(1),
X.dim32(2),
X.dim32(3),
@ -333,7 +320,6 @@ bool PoolGradientOp<float, CUDAContext, LpPoolFunctor>::
dY.data<float>(),
Y.data<float>(),
X.data<float>(),
X.dim32(0),
X.dim32(1),
X.dim32(2),
X.dim32(3),
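
In the NHWC backward kernel above, hend and wend were computed but never used, and hstart/wstart were clamped only after initialization; folding the clamp into the initializer deletes the dead code and lets both be const. A one-line sketch of the fold (hypothetical helper):

// Before:
//   int hstart = ph * stride_h - pad_t;
//   int hend = min(hstart + kernel_h, height);   // never read -> deleted
//   hstart = max(hstart, 0);
// After:
__device__ int pooling_window_start(const int p, const int stride, const int pad) {
  return max(p * stride - pad, 0);
}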

View File

@ -124,16 +124,15 @@ __global__ void PReluWGradientKernelNCHW(
template <typename T>
__global__ void PReluWGradientKernelNHWC(
const int C,
const int N,
const int num_items,
const T* Xdata,
const T* dYdata,
T* dW) {
int c = blockIdx.x;
const auto c = blockIdx.x;
T wsum = 0.0;
int items_per_channel = num_items / C;
const auto items_per_channel = num_items / C;
for (int i = threadIdx.x; i < items_per_channel; i += blockDim.x) {
int ii = i * C + c;
const auto ii = i * C + c;
wsum += (Xdata[ii] <= 0) * dYdata[ii] * Xdata[ii];
}
@ -270,7 +269,7 @@ bool PReluGradientOp<float, CUDAContext>::RunOnDevice() {
C,
CAFFE_CUDA_NUM_THREADS,
0,
context_.cuda_stream()>>>(C, N, X.numel(), Xdata, dYdata, dWdata);
context_.cuda_stream()>>>(C, X.numel(), Xdata, dYdata, dWdata);
C10_CUDA_KERNEL_LAUNCH_CHECK();
PReluGradientKernelNHWC<<<
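
PReluWGradientKernelNHWC now takes only the flat item count, and values derived from blockIdx.x/threadIdx.x use const auto, which also picks up the unsigned type of the built-in index variables. A hedged sketch of the same launch shape (hypothetical kernel, float only, with atomicAdd standing in for the real block-level reduction):

__global__ void per_channel_sum(const int C, const int num_items,
                                const float* const in, float* const out) {
  const auto c = blockIdx.x;                    // one block per channel
  const auto items_per_channel = num_items / C;
  float partial = 0.f;
  for (int i = threadIdx.x; i < items_per_channel; i += blockDim.x) {
    const auto ii = i * C + c;                  // NHWC-style interleaved layout
    partial += in[ii];
  }
  atomicAdd(&out[c], partial);                  // out must be zero-initialized by the caller
}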

View File

@ -80,8 +80,6 @@ __global__ void ROIPoolBackward(
const int nthreads,
const T* top_diff,
const int* argmax_data,
const int num_rois,
const T spatial_scale,
const int channels,
const int height,
const int width,
@ -91,10 +89,10 @@ __global__ void ROIPoolBackward(
const T* bottom_rois) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
// (n, c, ph, pw) is an element in the pooled output
int pw = index % pooled_width;
int ph = (index / pooled_width) % pooled_height;
int c = (index / pooled_width / pooled_height) % channels;
int n = index / pooled_width / pooled_height / channels;
const int pw = index % pooled_width;
const int ph = (index / pooled_width) % pooled_height;
const int c = (index / pooled_width / pooled_height) % channels;
const int n = index / pooled_width / pooled_height / channels;
const T* offset_bottom_rois = bottom_rois + n * 5;
int roi_batch_ind = offset_bottom_rois[0];
@ -184,8 +182,6 @@ C10_EXPORT bool RoIPoolGradientOp<float, CUDAContext>::RunOnDevice() {
dY.numel(),
dY.data<float>(),
A.data<int>(),
R.dim32(0),
spatial_scale_,
X.dim32(1),
X.dim32(2),
X.dim32(3),
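
Removing num_rois and spatial_scale from ROIPoolBackward only works because the launch above sheds R.dim32(0) and spatial_scale_ in the same change; the signature and every call site have to move together or the build breaks. A hedged sketch of that pairing, with hypothetical names:

__global__ void backward_sketch(const int nthreads, const float* const top_diff,
                                const int channels, float* const bottom_diff) {
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < nthreads;
       i += blockDim.x * gridDim.x) {
    bottom_diff[i] = top_diff[i] / channels;    // placeholder body
  }
}

void launch_backward_sketch(const int nthreads, const float* top_diff,
                            const int channels, float* bottom_diff,
                            cudaStream_t stream) {
  const int threads = 256;
  const int blocks = (nthreads + threads - 1) / threads;
  // Arguments dropped from the kernel must also be dropped here;
  // the mismatch is caught at compile time.
  backward_sketch<<<blocks, threads, 0, stream>>>(nthreads, top_diff,
                                                  channels, bottom_diff);
}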

View File

@ -49,8 +49,7 @@ __global__ void length_sum_kernel(
T* __restrict__ out,
const int* __restrict__ prefix_sum_length_data,
int N,
int post,
int len_length) {
int post) {
// len_length blocks
int group = blockIdx.x;
@ -177,12 +176,11 @@ C10_LAUNCH_BOUNDS_2(1024, SEGREDUCE_MINBLOCKS)
#endif
__global__ void length_weighted_sum_gradient_kernel(
const T* __restrict__ grad_in,
const T* __restrict__ weights_in,
const T *const __restrict__ weights_in,
T* __restrict__ grad_out,
const int* __restrict__ prefix_sum_length_data,
int N,
int post,
int len_length) {
const int *const __restrict__ prefix_sum_length_data,
const int N,
const int post) {
// len_length blocks
int group = blockIdx.x;
@ -213,21 +211,20 @@ template <typename T, typename IndexType, int NumThreads>
C10_LAUNCH_BOUNDS_2(1024, SEGREDUCE_MINBLOCKS)
#endif
__global__ void length_weighted_sum_with_main_input_gradient_kernel(
const T* __restrict__ grad_in,
const T* __restrict__ weights_in,
const T* __restrict__ data_in,
const IndexType* __restrict__ indices,
T* __restrict__ data_grad_out,
T* __restrict__ weights_grad_out,
const int* __restrict__ prefix_sum_length_data,
int N,
int post,
int len_length) {
const T *const __restrict__ grad_in,
const T *const __restrict__ weights_in,
const T *const __restrict__ data_in,
const IndexType *const __restrict__ indices,
T *const __restrict__ data_grad_out,
T *const __restrict__ weights_grad_out,
const int *const __restrict__ prefix_sum_length_data,
const int N,
const int post) {
// len_length blocks
int group = blockIdx.x;
const int group = blockIdx.x;
int start = group == 0 ? 0 : prefix_sum_length_data[group - 1];
int end = prefix_sum_length_data[group];
const int start = group == 0 ? 0 : prefix_sum_length_data[group - 1];
const int end = prefix_sum_length_data[group];
CUDA_KERNEL_ASSERT(start <= N);
CUDA_KERNEL_ASSERT(end <= N);
@ -257,19 +254,17 @@ C10_LAUNCH_BOUNDS_2(1024, SEGREDUCE_MINBLOCKS)
#endif
__global__ void sparse_length_max_kernel(
const T* __restrict__ in,
T* __restrict__ out,
const int* __restrict__ prefix_sum_length_data,
const IndexType* __restrict__ indices,
int N,
int post,
int len_length,
int len_indices,
T *const __restrict__ out,
const int *const __restrict__ prefix_sum_length_data,
const IndexType *const __restrict__ indices,
const int post,
const int len_indices,
const T numeric_min) {
// len_length blocks
int group = blockIdx.x;
const int group = blockIdx.x;
int start = group == 0 ? 0 : prefix_sum_length_data[group - 1];
int end = prefix_sum_length_data[group];
const int start = group == 0 ? 0 : prefix_sum_length_data[group - 1];
const int end = prefix_sum_length_data[group];
CUDA_KERNEL_ASSERT(start <= len_indices);
CUDA_KERNEL_ASSERT(end <= len_indices);
@ -487,12 +482,12 @@ class CUDASparseLengthsSumOp : public Operator<CUDAContext> {
if (post <= maxThreads) {
length_sum_kernel<T, true, false>
<<<len_length, post, 0, context_.cuda_stream()>>>(
in_data, out_data, prefix_sum_length_data, N, post, len_length);
in_data, out_data, prefix_sum_length_data, N, post);
C10_CUDA_KERNEL_LAUNCH_CHECK();
} else {
length_sum_kernel<T, true, false>
<<<len_length, maxThreads, 0, context_.cuda_stream()>>>(
in_data, out_data, prefix_sum_length_data, N, post, len_length);
in_data, out_data, prefix_sum_length_data, N, post);
C10_CUDA_KERNEL_LAUNCH_CHECK();
}
}
@ -625,13 +620,13 @@ class CUDASparseLengthsMeanOp : public Operator<CUDAContext> {
// calling cuda kernel with ExactBlock = true, Average = true
length_sum_kernel<T, true, true>
<<<len_length, post, 0, context_.cuda_stream()>>>(
in_data, out_data, prefix_sum_length_data, N, post, len_length);
in_data, out_data, prefix_sum_length_data, N, post);
C10_CUDA_KERNEL_LAUNCH_CHECK();
} else {
// calling cuda kernel with ExactBlock = true, Average = true
length_sum_kernel<T, true, true>
<<<len_length, maxThreads, 0, context_.cuda_stream()>>>(
in_data, out_data, prefix_sum_length_data, N, post, len_length);
in_data, out_data, prefix_sum_length_data, N, post);
C10_CUDA_KERNEL_LAUNCH_CHECK();
}
}
@ -719,9 +714,9 @@ class CUDASparseLengthsMaxOp : public Operator<CUDAContext> {
T numeric_min = std::numeric_limits<T>::min();
if (SparseFused) {
if (post <= maxThreads) {
int multiple = std::min(maxThreads / post, SEGREDUCE_MINBLOCKS);
dim3 block(post, multiple);
size_t smem = sizeof(T) * post * multiple;
const int multiple = std::min(maxThreads / post, SEGREDUCE_MINBLOCKS);
const dim3 block(post, multiple);
const size_t smem = sizeof(T) * post * multiple;
sparse_length_max_kernel<T, IndexType, true>
<<<len_length, block, smem, context_.cuda_stream()>>>(
@ -729,9 +724,7 @@ class CUDASparseLengthsMaxOp : public Operator<CUDAContext> {
out_data,
prefix_sum_length_data,
indices,
N,
post,
len_length,
dataToReduceSize,
numeric_min);
C10_CUDA_KERNEL_LAUNCH_CHECK();
@ -742,9 +735,7 @@ class CUDASparseLengthsMaxOp : public Operator<CUDAContext> {
out_data,
prefix_sum_length_data,
indices,
N,
post,
len_length,
dataToReduceSize,
numeric_min);
C10_CUDA_KERNEL_LAUNCH_CHECK();
@ -1528,8 +1519,7 @@ class CUDASparseLengthsWeightedSumGradientWithIndicesOp
out_data,
prefix_sum_length_data,
N,
post,
len_length);
post);
C10_CUDA_KERNEL_LAUNCH_CHECK();
} else {
length_weighted_sum_gradient_kernel<T, false>
@ -1539,8 +1529,7 @@ class CUDASparseLengthsWeightedSumGradientWithIndicesOp
out_data,
prefix_sum_length_data,
N,
post,
len_length);
post);
C10_CUDA_KERNEL_LAUNCH_CHECK();
}
@ -1768,8 +1757,7 @@ class CUDASparseLengthsIndicesInGradientWeightedSumWithMainInputGradientOp
out_weight_grads,
prefix_sum_length_data,
N,
post,
len_length);
post);
C10_CUDA_KERNEL_LAUNCH_CHECK();
} else if (post > 64) {
length_weighted_sum_with_main_input_gradient_kernel<T, IndexType, 128>
@ -1782,8 +1770,7 @@ class CUDASparseLengthsIndicesInGradientWeightedSumWithMainInputGradientOp
out_weight_grads,
prefix_sum_length_data,
N,
post,
len_length);
post);
C10_CUDA_KERNEL_LAUNCH_CHECK();
} else if (post > 32) {
length_weighted_sum_with_main_input_gradient_kernel<T, IndexType, 64>
@ -1796,8 +1783,7 @@ class CUDASparseLengthsIndicesInGradientWeightedSumWithMainInputGradientOp
out_weight_grads,
prefix_sum_length_data,
N,
post,
len_length);
post);
C10_CUDA_KERNEL_LAUNCH_CHECK();
} else {
length_weighted_sum_with_main_input_gradient_kernel<T, IndexType, 32>
@ -1810,8 +1796,7 @@ class CUDASparseLengthsIndicesInGradientWeightedSumWithMainInputGradientOp
out_weight_grads,
prefix_sum_length_data,
N,
post,
len_length);
post);
C10_CUDA_KERNEL_LAUNCH_CHECK();
}
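
In these segment-reduction kernels, len_length duplicated information the launch already carries -- they run with one block per segment, so blockIdx.x spans [0, len_length) -- and read-only arrays now use the `const T *const __restrict__` form. A hedged sketch of that launch contract (hypothetical kernel):

template <typename T>
__global__ void segment_sum_sketch(const T *const __restrict__ in,
                                   T *const __restrict__ out,
                                   const int *const __restrict__ prefix_sum_length_data,
                                   const int N,
                                   const int post) {
  const int group = blockIdx.x;   // segment id; gridDim.x == number of segments
  const int start = group == 0 ? 0 : prefix_sum_length_data[group - 1];
  const int end = prefix_sum_length_data[group];
  for (int i = threadIdx.x; i < post; i += blockDim.x) {
    T sum = 0;
    for (int line = start; line < end && line < N; ++line) {
      sum += in[line * post + i];
    }
    out[group * post + i] = sum;
  }
}

The call sites above still launch with <<<len_length, ...>>>, so nothing is lost by dropping the argument.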

View File

@ -5,7 +5,7 @@
namespace caffe2 {
namespace {
__global__ void SliceCopyKernel(
char* src_offset_bytes,
const char* src_offset_bytes,
int src_block_size_bytes,
char* dst_offset_bytes,
int dst_block_size_bytes,
@ -165,8 +165,8 @@ bool SliceImplGpu(
size_t src_block_size_bytes = itemsize * src_block_size;
size_t dst_block_size_bytes = itemsize * dst_block_size;
char* src_offset_bytes = src_bytes + itemsize * src_offset;
char* dst_offset_bytes = dst_bytes;
const char *const src_offset_bytes = src_bytes + itemsize * src_offset;
char *const dst_offset_bytes = dst_bytes;
SliceCopyKernel<<<
std::min(num_blocks, CAFFE_MAXIMUM_NUM_BLOCKS),
@ -182,25 +182,22 @@ bool SliceImplGpu(
num_blocks);
C10_CUDA_KERNEL_LAUNCH_CHECK();
} else {
char* src_bytes = (char*)go->raw_data();
char* dst_bytes = (char*)gdata->raw_mutable_data(go->meta());
const char *const src_bytes = (char*)go->raw_data();
char *const dst_bytes = (char*)gdata->raw_mutable_data(go->meta());
size_t src_nbytes = go->nbytes();
size_t dst_nbytes = gdata->nbytes();
size_t src_block_size = unit * (ends_idx[dim] - starts_idx[dim]);
size_t dst_block_size = unit * data.size(dim);
size_t dst_offset = unit * starts_idx[dim];
const size_t src_block_size = unit * (ends_idx[dim] - starts_idx[dim]);
const size_t dst_block_size = unit * data.size(dim);
const size_t dst_offset = unit * starts_idx[dim];
if (num_blocks == 0 || dst_block_size == 0) {
return true;
}
size_t src_block_size_bytes = itemsize * src_block_size;
size_t dst_block_size_bytes = itemsize * dst_block_size;
const size_t src_block_size_bytes = itemsize * src_block_size;
const size_t dst_block_size_bytes = itemsize * dst_block_size;
char* src_offset_bytes = src_bytes;
char* dst_offset_bytes = dst_bytes + itemsize * dst_offset;
const char *const src_offset_bytes = src_bytes;
char *const dst_offset_bytes = dst_bytes + itemsize * dst_offset;
// Zero out gradient blob before copy since we copy in fewer items than
// there is space for
math::Set<float, CUDAContext>(
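
SliceCopyKernel only ever reads through its source pointer, so the parameter and the host-side pointers feeding it gain const, and the compiler can then reject any accidental write through them. A minimal sketch of that const-correct direction (hypothetical byte-copy kernel):

__global__ void slice_copy_sketch(const char *const src, char *const dst,
                                  const size_t nbytes) {
  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < nbytes;
       i += blockDim.x * gridDim.x) {
    dst[i] = src[i];   // byte-wise copy; the real kernel copies strided blocks
  }
}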

View File

@ -231,7 +231,6 @@ __global__ void ComputeXGradientNCHWCUDAKernel(
template <typename T>
__global__ void ComputeXGradientNHWCCUDAKernel(
const int C,
const int HxW,
const T* dY,
const T* X,
const T* alpha,
@ -462,7 +461,7 @@ void SpatialBNGradientOp<CUDAContext>::ComputeXGradient(
<<<dim3(N * HxW, M),
CAFFE_CUDA_NUM_THREADS,
0,
context_.cuda_stream()>>>(C, HxW, dY, X, alpha, beta, gamma, dX);
context_.cuda_stream()>>>(C, dY, X, alpha, beta, gamma, dX);
C10_CUDA_KERNEL_LAUNCH_CHECK();
}
}
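
ComputeXGradientNHWCCUDAKernel never reads HxW inside the kernel body; it only matters on the host when sizing the grid (dim3(N * HxW, M)), so it drops out of the signature and the launch arguments while staying in the launch configuration. A hedged sketch of that split, with hypothetical names and C assumed to fit in a single block for brevity:

__global__ void scale_channels_nhwc_sketch(const int C, const float* const dY,
                                           const float* const gamma,
                                           float* const dX) {
  // One block per (n, h, w) position; threads cover the channel dimension.
  if (threadIdx.x < C) {
    const int index = blockIdx.x * C + threadIdx.x;
    dX[index] = dY[index] * gamma[threadIdx.x];
  }
}

void launch_scale_channels_nhwc_sketch(const int N, const int C, const int HxW,
                                       const float* dY, const float* gamma,
                                       float* dX, cudaStream_t stream) {
  // HxW shapes the launch but is not forwarded to the kernel.
  scale_channels_nhwc_sketch<<<N * HxW, C, 0, stream>>>(C, dY, gamma, dX);
}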