Mirror of https://github.com/pytorch/pytorch.git (synced 2025-10-20 12:54:11 +08:00)
Convert all tabs to spaces, add CI. (#18959)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/18959
ghimport-source-id: a934163fa34cb2019732d5f49dc7290c376bf156
Differential Revision: D14831246
Pulled By: ezyang
fbshipit-source-id: beb92dc4ee8c82f4c8259c081dd72e477fe7a9d0
Committed by: Facebook Github Bot
Parent: 544783fa1d
Commit: 48a35135fb
Travis CI configuration — a new "Ensure no tabs" job is added (four new lines; the surrounding lines are unchanged context):

@@ -16,6 +16,10 @@ matrix:
       python: "3.6"
       dist: xenial
       script: cd .circleci && ./ensure-consistency.py
+    - name: "Ensure no tabs"
+      python: "2.7"
+      script:
+        - (! git grep -I -l $'\t' -- . ':(exclude)*.svg' ':(exclude)**Makefile' ':(exclude)**/contrib/**' ':(exclude)third_party' ':(exclude).gitattributes' ':(exclude).gitmodules' || (echo "The above files have tabs; please convert them to spaces"; false))
     - name: "Python 2.7 Lint"
       python: "2.7"
       install: pip install flake8
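The new job is a single `git grep` invocation, so the same check can be run locally before pushing. The Python sketch below mirrors that idea; it is illustrative only — the script name, the exclusion lists, and the crude binary-file test are assumptions, not part of this commit:

    #!/usr/bin/env python
    # check_tabs.py -- illustrative local helper, not part of this commit.
    # Lists tracked files that still contain tab characters, skipping roughly
    # the same paths the Travis `git grep` command excludes.
    import subprocess
    import sys

    # Assumed mapping of the CI pathspec excludes onto simple string tests.
    EXCLUDED_PARTS = ('third_party', '/contrib/')
    EXCLUDED_SUFFIXES = ('.svg', 'Makefile', '.gitattributes', '.gitmodules')

    def tracked_files():
        # `git ls-files` enumerates tracked paths, like the `--` pathspec in CI.
        out = subprocess.check_output(['git', 'ls-files']).decode('utf-8')
        return out.splitlines()

    def has_tabs(path):
        try:
            with open(path, 'rb') as f:
                data = f.read()
        except (IOError, OSError):
            return False
        if b'\0' in data:  # crude binary-file check, standing in for git grep's -I flag
            return False
        return b'\t' in data

    def main():
        offenders = [p for p in tracked_files()
                     if not p.endswith(EXCLUDED_SUFFIXES)
                     and not any(part in p for part in EXCLUDED_PARTS)
                     and has_tabs(p)]
        for p in offenders:
            print(p)
        if offenders:
            print('The above files have tabs; please convert them to spaces')
            return 1
        return 0

    if __name__ == '__main__':
        sys.exit(main())

Run it from the repository root; a non-zero exit status means at least one tracked file still contains tabs.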
CMake CUDA/cuFFT configuration — whitespace-only change; the --generate-code lines inside each branch had their tab indentation replaced with spaces, the code itself is unchanged:

@@ -252,21 +252,21 @@ IF(USE_CUDA AND NOT USE_ROCM)
  EXECUTE_PROCESS(COMMAND touch ${CMAKE_CURRENT_BINARY_DIR}/empty_file.cc)
  if(${CUDA_VERSION_MAJOR} EQUAL "8")
    SET(CUFFT_FAKELINK_OPTIONS
      --generate-code arch=compute_35,code=sm_35
      --generate-code arch=compute_50,code=sm_50
      --generate-code arch=compute_60,code=sm_60)
  elseif(${CUDA_VERSION_MAJOR} EQUAL "9")
    SET(CUFFT_FAKELINK_OPTIONS
      --generate-code arch=compute_35,code=sm_35
      --generate-code arch=compute_50,code=sm_50
      --generate-code arch=compute_60,code=sm_60
      --generate-code arch=compute_70,code=sm_70)
  elseif(${CUDA_VERSION_MAJOR} EQUAL "10")
    SET(CUFFT_FAKELINK_OPTIONS
      --generate-code arch=compute_35,code=sm_35
      --generate-code arch=compute_50,code=sm_50
      --generate-code arch=compute_60,code=sm_60
      --generate-code arch=compute_70,code=sm_70)
  else()
    MESSAGE(FATAL_ERROR "Unhandled major cuda version ${CUDA_VERSION_MAJOR}")
  endif()
The remaining hunks in this commit are whitespace-only: tab indentation is converted to spaces and the code is otherwise unchanged. They touch the following locations:

@@ -19,7 +19,7 @@ (intrinsics header: mmintrin/altivec includes)
@@ -46,12 +46,12 @@ static Tensor sumproduct_pair(...)
@@ -117,7 +117,7 @@ static Tensor sumproduct_pair(...)
@@ -183,7 +183,7 @@ Tensor einsum(std::string eqn, TensorList tensors)
@@ -360,8 +360,8 @@ Tensor einsum(std::string eqn, TensorList tensors)
@@ -390,11 +390,11 @@ Tensor _trilinear(...)
@@ -408,8 +408,8 @@ Tensor _trilinear(...)
@@ -417,7 +417,7 @@ Tensor _trilinear(...)
@@ -473,7 +473,7 @@ Tensor tensordot(const Tensor& input1, const Tensor& input2, ...)
@@ -61,7 +61,7 @@ ctc_loss_cpu_template(...)
@@ -83,8 +83,8 @@ ctc_loss_cpu_template(...)
@@ -115,11 +115,11 @@ ctc_loss_cpu_template(...)
@@ -141,7 +141,7 @@ ctc_loss_cpu_template(...)
@@ -182,7 +182,7 @@ ctc_loss_backward_cpu_template(...)
@@ -268,9 +268,9 @@ ctc_loss_backward_cpu_template(...)
@@ -309,9 +309,9 @@ ctc_loss_cpu(...)
@@ -320,9 +320,9 @@ ctc_loss_backward_cpu(...)
@@ -76,11 +76,11 @@ pthreadpool_t nnpack_threadpool()
@@ -614,7 +614,7 @@ std::tuple<Tensor, Tensor> NAME( \ (RNN dispatch macro)
@@ -126,10 +126,10 @@ Tensor& arange_cpu_out(...)
@@ -100,7 +100,7 @@ typedef union imm_xmm_union
@@ -228,8 +228,8 @@ inline v8sf log256_ps(v8sf x)
@@ -266,7 +266,7 @@ (cuFFT plan creation: hipfftMakePlanMany / cufftXtMakePlanMany)
@@ -87,10 +87,10 @@ __global__ void embedding_backward_feature_kernel
@@ -103,7 +103,7 @@ __global__ void embedding_backward_feature_kernel
@@ -110,8 +110,8 @@ ctc_loss_log_alpha_gpu_kernel(...)
@@ -135,7 +135,7 @@ ctc_loss_log_alpha_gpu_kernel(...)
@@ -218,8 +218,8 @@ ctc_loss_gpu_template(...)
@@ -242,7 +242,7 @@ ctc_loss_gpu_template(...)
@@ -304,8 +304,8 @@ ctc_loss_backward_log_beta_gpu_kernel(...)
@@ -377,7 +377,7 @@ ctc_loss_backward_collect_nonblank_gpu_kernel(...)
@@ -405,9 +405,9 @@ ctc_loss_backward_collect_nonblank_gpu_kernel(...)
@@ -429,7 +429,7 @@ ctc_loss_backward_collect_gpu_kernel(...)
@@ -481,7 +481,7 @@ ctc_loss_backward_collect_gpu_kernel(...)
@@ -500,7 +500,7 @@ ctc_loss_backward_gpu_template(...)
@@ -558,15 +558,15 @@ ctc_loss_backward_gpu_template(...)
@@ -630,9 +630,9 @@ ctc_loss_gpu(...)
@@ -641,9 +641,9 @@ ctc_loss_backward_gpu(...)
@@ -146,10 +146,10 @@ Tensor& arange_cuda_out(...)
@@ -441,18 +441,18 @@ std::tuple<Tensor, Tensor> weight_norm_cuda_backward
@@ -72,17 +72,17 @@ std::tuple<Tensor, Tensor> _cudnn_ctc_loss(...)
@@ -456,7 +456,7 @@ struct algorithm_search<miopenConvFwdAlgorithm_t>
@@ -5,12 +5,12 @@ (THLapack_ / THLapackCheck macros)
@@ -14,7 +14,7 @@ typedef struct THMemoryFile__
@@ -149,13 +149,13 @@ void THTensor_(gels)(...)
@@ -378,21 +378,21 @@ void THTensor_(gesdd2)(...)
@@ -999,31 +999,31 @@ int THTensor_(equal)(THTensor *ta, THTensor* tb) (TENSOR_IMPLEMENT_LOGICAL macro)
@@ -1302,10 +1302,10 @@ void THTensor_(norm)(...)
@@ -17,10 +17,10 @@ TH_API void THVector_(cdiv)(...) (THVector_(normal_fill) declaration)
@@ -1342,7 +1342,7 @@ static void THFloatVector_divs_VSX(...)
@@ -119,7 +119,7 @@ static inline void cpuid(...)
@@ -308,12 +308,12 @@ void THCudaBlas_Hgemm(...)
@@ -48,7 +48,7 @@ inline bool getCatGrid(THCState* state, ptrdiff_t nTensors, dim3& grid)
@@ -131,7 +131,7 @@ __global__ void CatArrayBatchedCopy(
@@ -79,7 +79,7 @@ condDiv(T *q, int64_t *J, int64_t inputsize, T q_max)
@@ -236,7 +236,7 @@ sampleMultinomialOnce(int64_t* dest,
@@ -42,7 +42,7 @@ THCTensor_(numel)(THCState *state, THCTensor *t) (THCTensor_(cat))
@@ -73,7 +73,7 @@ inline void THCTensor_(check_shape_except_dim)(...) (THCTensor_(catArray))
@@ -34,9 +34,9 @@ THC_API scalar_t THCTensor_(maxall)(...) (THCTensor_(median) declaration)
@@ -249,7 +249,7 @@ void THCTensor_(multinomial)(struct THCState *state,
@@ -7,12 +7,12 @@ (ZERO_MACRO / zero<T>() helpers)
@@ -88,7 +88,7 @@ __global__ void cunn_LookupTable_accGradParametersKernelByFeature
@@ -49,14 +49,14 @@ __global__ void cunn_LookupTableBag_updateOutputKernel(
@@ -114,9 +114,9 @@ __global__ void cunn_LookupTableBag_accGradParametersKernel(
@@ -16,9 +16,9 @@ template<typename Dtype, typename Acctype> (nearest_neighbor_4d_kernel)
@@ -37,10 +37,10 @@ __global__ void nearest_neighbor_4d_kernel(
@@ -49,8 +49,8 @@ __global__ void nearest_neighbor_4d_kernel(
@@ -62,9 +62,9 @@ template <typename Dtype, typename Acctype> (nearest_neighbor_4d_kernel_backward)
@@ -83,10 +83,10 @@ __global__ void nearest_neighbor_4d_kernel_backward(
@@ -16,9 +16,9 @@ template<typename Dtype, typename Acctype> (nearest_neighbor_3d_kernel)
@@ -32,10 +32,10 @@ __global__ void nearest_neighbor_3d_kernel(
@@ -43,8 +43,8 @@ __global__ void nearest_neighbor_3d_kernel(
@@ -56,9 +56,9 @@ template <typename Dtype, typename Acctype> (nearest_neighbor_3d_kernel_backward)
@@ -72,10 +72,10 @@ __global__ void nearest_neighbor_3d_kernel_backward(
@@ -16,9 +16,9 @@ template<typename Dtype, typename Acctype> (nearest_neighbor_5d_kernel)
@@ -55,8 +55,8 @@ __global__ void nearest_neighbor_5d_kernel(
@@ -68,9 +68,9 @@ template <typename Dtype, typename Acctype> (nearest_neighbor_5d_kernel_backward)
@@ -108,8 +108,8 @@ __global__ void nearest_neighbor_5d_kernel_backward(
@ -24,7 +24,7 @@ inline int GET_BLOCKS(const int N)
|
||||
}
|
||||
|
||||
#define THCUNN_check_shape(STATE, I1, I2) \
|
||||
if (I1 != NULL && I2 != NULL && !THCTensor_(isSameSizeAs)(STATE, I1, I2)) \
|
||||
if (I1 != NULL && I2 != NULL && !THCTensor_(isSameSizeAs)(STATE, I1, I2)) \
|
||||
{ \
|
||||
THCDescBuff s1 = THCTensor_(sizeDesc)(STATE, I1); \
|
||||
THCDescBuff s2 = THCTensor_(sizeDesc)(STATE, I2); \
|
||||
@ -47,20 +47,20 @@ inline int GET_BLOCKS(const int N)
|
||||
ptrdiff_t n1 = THCTensor_(nElement)(STATE, I1); \
|
||||
ptrdiff_t n2 = THCTensor_(nElement)(STATE, I2); \
|
||||
if (n1 != n2) \
|
||||
{ \
|
||||
{ \
|
||||
THCDescBuff s1 = THCTensor_(sizeDesc)(state, I1); \
|
||||
THCDescBuff s2 = THCTensor_(sizeDesc)(state, I2); \
|
||||
THError(#I1 " and " #I2 " have different number of elements: " \
|
||||
THError(#I1 " and " #I2 " have different number of elements: " \
|
||||
#I1 "%s has %ld elements, while " \
|
||||
#I2 "%s has %ld elements", s1.str, n1, s2.str, n2); \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
#define THCUNN_check_dim_size(STATE, T, DIM, DIM_SIZE, SIZE) \
|
||||
if (THCTensor_(nDimensionLegacyNoScalars)(STATE, T) != DIM || \
|
||||
THCTensor_(sizeLegacyNoScalars)(STATE, T, DIM_SIZE) != SIZE) { \
|
||||
THCDescBuff s1 = THCTensor_(sizeDesc)(state, T); \
|
||||
THError("Need " #T " of dimension %d and " #T ".size[%d] == %d" \
|
||||
THError("Need " #T " of dimension %d and " #T ".size[%d] == %d" \
|
||||
" but got " #T " to be of shape: %s", DIM, DIM_SIZE, SIZE, s1.str); \
|
||||
}
|
||||
|
||||
|
@ -10,7 +10,7 @@ void THNN_(LookupTableBag_updateOutput)(
|
||||
THCTensor *weight,
|
||||
THCTensor *output,
|
||||
THCIndexTensor *offset2bag,
|
||||
int mode,
|
||||
int mode,
|
||||
THCIndexTensor *bag_size)
|
||||
{
|
||||
THCUNN_assertSameGPU(state, 5, input, offsets, weight, output, offset2bag);
|
||||
@ -65,8 +65,8 @@ void THNN_(LookupTableBag_accGradParameters)(
|
||||
THCIndexTensor *sortedIndices,
|
||||
THCIndexTensor *origIndices,
|
||||
bool scaleGradByFreq,
|
||||
int mode,
|
||||
THCIndexTensor *bag_size,
|
||||
int mode,
|
||||
THCIndexTensor *bag_size,
|
||||
accreal scale_)
|
||||
{
|
||||
scalar_t scale = ScalarConvert<accreal, scalar_t>::to(scale_);
|
||||
|
@ -88,7 +88,7 @@ static THCTensor* THNN_(newViewWeightMM2d)(THCState *state, THCTensor *weight) {
|
||||
int64_t s2 = weight->size(1) * weight->size(2) * weight->size(3);
|
||||
THCTensor *old_weight = weight;
|
||||
weight = THCTensor_(newWithStorage2d)(state, THTensor_getStoragePtr(weight), weight->storage_offset(),
|
||||
s1, -1, s2, -1);
|
||||
s1, -1, s2, -1);
|
||||
THCTensor_(free)(state, old_weight);
|
||||
}
|
||||
return weight;
|
||||
|
@ -11,7 +11,7 @@ static inline void THNN_(SpatialDilatedConvolution_shapeCheck)(
|
||||
int kH, int kW, int dH, int dW, int padH, int padW,
|
||||
int dilationH, int dilationW, int weight_nullable) {
|
||||
THArgCheck(kW > 0 && kH > 0, 9,
|
||||
"kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW);
|
||||
"kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW);
|
||||
THArgCheck(dW > 0 && dH > 0, 11,
|
||||
"stride should be greater than zero, but got dH: %d dW: %d", dH, dW);
|
||||
THArgCheck(dilationW > 0 && dilationH > 0, 14,
|
||||
|
@@ -34,7 +34,7 @@ void THNN_(SpatialUpSamplingNearest_updateOutput)(
@@ -44,14 +44,14 @@ void THNN_(SpatialUpSamplingNearest_updateOutput)(
@@ -62,7 +62,7 @@ void THNN_(SpatialUpSamplingNearest_updateOutput)(
@@ -73,15 +73,15 @@ void THNN_(SpatialUpSamplingNearest_updateGradInput)(
@@ -94,7 +94,7 @@ void THNN_(SpatialUpSamplingNearest_updateGradInput)(
@@ -243,7 +243,7 @@ THC_API void THNN_(LookupTableBag_updateOutput)(
          THCTensor *weight,
          THCTensor *output,
          THCIndexTensor *offset2bag,
          int mode,
          THCIndexTensor *seq_length); // [OPTIONAL]

THC_API void THNN_(LookupTableBag_accGradParameters)(
@@ -256,8 +256,8 @@ THC_API void THNN_(LookupTableBag_accGradParameters)(
          THCIndexTensor *sortedIndices,
          THCIndexTensor *origIndices,
          bool scaleGradByFreq,
          int mode,
          THCIndexTensor *seq_length, // [OPTIONAL]
          accreal scale_);

THC_API void THNN_(L1Cost_updateOutput)(
@@ -54,7 +54,7 @@ void THNN_(TemporalUpSamplingNearest_updateOutput)(
  const int num_threads = at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock;
  cudaStream_t stream = THCState_getCurrentStream(state);
  nearest_neighbor_3d_kernel<scalar_t, accreal> <<<THCCeilDiv(num_kernels, num_threads), num_threads,
      0, stream>>>(num_kernels, idata, odata);
  THCudaCheck(cudaGetLastError());
}

@@ -82,7 +82,7 @@ void THNN_(TemporalUpSamplingNearest_updateGradInput)(
  cudaStream_t stream = THCState_getCurrentStream(state);

  nearest_neighbor_3d_kernel_backward<scalar_t, accreal> <<<THCCeilDiv(num_kernels, num_threads),
      num_threads, 0, stream>>>(num_kernels, data1, data2);

  THCudaCheck(cudaGetLastError());
  THCTensor_(free)(state, gradOutput);
@@ -47,10 +47,10 @@ void THNN_(VolumetricUpSamplingNearest_updateOutput)(
@@ -67,7 +67,7 @@ void THNN_(VolumetricUpSamplingNearest_updateOutput)(
@@ -88,8 +88,8 @@ void THNN_(VolumetricUpSamplingNearest_updateGradInput)(
@@ -100,7 +100,7 @@ void THNN_(VolumetricUpSamplingNearest_updateGradInput)(
@@ -36,7 +36,7 @@ static Acctype linear_upsampling_compute_source_index(

__device__ __forceinline__
static int nearest_neighbor_compute_source_index(
    const float scale, int dst_index, int inputSize) {
  const int src_index = MIN(floor(dst_index * scale), inputSize - 1);
  return src_index;
}
@@ -29,16 +29,16 @@ void THNN_(BCECriterion_updateOutput)(
@@ -48,7 +48,7 @@ void THNN_(BCECriterion_updateOutput)(
@@ -57,7 +57,7 @@ void THNN_(BCECriterion_updateOutput)(
@@ -25,7 +25,7 @@ void THNN_(ClassNLLCriterion_updateOutput)(
  if (weights && THTensor_(nElement)(weights) != n_classes) {
    THDescBuff s1 = THTensor_(sizeDesc)(weights);
    THError("weight tensor should be defined either for all %d classes or no classes"
            " but got weight tensor of shape: %s", n_classes, s1.str);
  }

  if (reduction == Reduction::None && n_dims == 2) {
@@ -39,8 +39,8 @@ void THNN_(ClassNLLCriterion_updateOutput)(
      int cur_target = THLongTensor_fastGetLegacy1dNoScalars(target, i);

      if (cur_target == ignore_index) {
        THTensor_(fastSet1d)(output, i, 0.0f);
        continue;
      }
      if (cur_target >= 0 && cur_target < n_classes) {
        scalar_t cur_weight = weights ? THTensor_(fastGetLegacy1dNoScalars)(weights, cur_target) : 1.0f;
@@ -40,7 +40,7 @@ void THNN_(MultiMarginCriterion_updateOutput)(
  {
    THIndex_t idx = THIndexTensor_(get1d)(target, t);
    THArgCheck((idx >= 0) && (idx < dim), 3,
               "target out of range");
  }

  input = THTensor_(newContiguous)(input);
@@ -98,7 +98,7 @@ void THNN_(SpatialAdaptiveMaxPooling_updateOutput)(


  THNN_ARGCHECK(!input->is_empty() && (input->dim() == 3 || input->dim() == 4), 2, input,
                "non-empty 3D or 4D (batch mode) tensor expected for input, but got: %s");

  if (input->dim() == 4)
  {
@@ -6,9 +6,9 @@
@@ -27,12 +27,12 @@ static inline void THNN_(SpatialAveragePooling_shapeCheck)(
@@ -44,7 +44,7 @@ static inline void THNN_(SpatialAveragePooling_shapeCheck)(
@@ -4,12 +4,12 @@

#define INITIAL_CHECK \
  THArgCheck(THIndexTensor_(nDimensionLegacyAll)(target) == 3, 3, \
             "only batches of spatial targets supported (3D tensors)" \
             " but got targets of dimension: %d", \
             THIndexTensor_(nDimensionLegacyAll)(target)); \
  THArgCheck(THTensor_(nDimensionLegacyAll)(input) == 4, 2, \
             "only batches of spatial inputs supported (4D tensors), " \
             "but got input of dimension: %d", THTensor_(nDimensionLegacyAll)(input)); \
  if (weights && THTensor_(nElement)(weights) != THTensor_(size)(input, 1)) { \
    THError("weight tensor should be defined either for all or no classes"); \
  } \
@@ -30,8 +30,8 @@
#define GRADOUTPUT_SHAPE_CHECK \
  THArgCheck(THTensor_(nDimensionLegacyAll)(gradOutput) == 3, 3, \
             "gradOutput must have same dimension as target (3)" \
             " but got dimension: %d", \
             THTensor_(nDimensionLegacyAll)(gradOutput)); \
  { \
    int64_t gradOutput0 = THTensor_(size)(gradOutput, 0); \
    int64_t gradOutput1 = THTensor_(size)(gradOutput, 1); \
@@ -5,14 +5,14 @@
@@ -36,7 +36,7 @@ static inline void THNN_(SpatialConvolutionMM_shapeCheck)(
@@ -87,8 +87,8 @@ static THTensor* THNN_(newViewWeightMM2d)(THTensor *weight) {
@@ -116,8 +116,8 @@ static void THNN_(SpatialConvolutionMM_updateOutput_frame)(
@@ -125,8 +125,8 @@ static void THNN_(SpatialConvolutionMM_updateOutput_frame)(
@@ -202,10 +202,10 @@ void THNN_(SpatialConvolutionMM_updateOutput)(
@@ -239,9 +239,9 @@ static void THNN_(SpatialConvolutionMM_updateGradInput_frame)(
@@ -280,8 +280,8 @@ void THNN_(SpatialConvolutionMM_updateGradInput)(
@@ -296,8 +296,8 @@ void THNN_(SpatialConvolutionMM_updateGradInput)(
@@ -380,7 +380,7 @@ void THNN_(SpatialConvolutionMM_accGradParameters)(
@@ -396,7 +396,7 @@ void THNN_(SpatialConvolutionMM_accGradParameters)(
@@ -5,10 +5,10 @@
@@ -40,7 +40,7 @@ static inline void THNN_(SpatialDilatedConvolution_shapeCheck)(
@@ -235,7 +235,7 @@ void THNN_(SpatialDilatedConvolution_updateGradInput)(
@@ -342,7 +342,7 @@ void THNN_(SpatialDilatedConvolution_accGradParameters)(
@@ -6,9 +6,9 @@
@@ -30,12 +30,12 @@ static inline void THNN_(SpatialDilatedMaxPooling_shapeCheck)(
@@ -47,7 +47,7 @@ static inline void THNN_(SpatialDilatedMaxPooling_shapeCheck)(
@@ -221,16 +221,16 @@ void THNN_(SpatialDilatedMaxPooling_updateOutput)(
@@ -266,10 +266,10 @@ static void THNN_(SpatialDilatedMaxPooling_updateGradInput_frame)(
@@ -350,13 +350,13 @@ void THNN_(SpatialDilatedMaxPooling_updateGradInput)(
@@ -3,15 +3,15 @@
@@ -41,7 +41,7 @@ static inline void THNN_(SpatialFullDilatedConvolution_shapeCheck)(
@@ -50,8 +50,8 @@ static inline void THNN_(SpatialFullDilatedConvolution_shapeCheck)(
@@ -111,12 +111,12 @@ void THNN_(SpatialMaxUnpooling_updateOutput)(
    for (p = 0; p < nbatch; p++)
    {
      THNN_(SpatialMaxUnpooling_updateOutput_frame)(
          input_data+p*nslices*iwidth*iheight,
          output_data+p*nslices*owidth*oheight,
          indices_data+p*nslices*iwidth*iheight,
          nslices,
          iwidth, iheight,
          owidth, oheight);
    }
  }

@@ -196,7 +196,7 @@ void THNN_(SpatialMaxUnpooling_updateGradInput)(

  if(owidth!=gradOutput->size(dimw) || oheight!=gradOutput->size(dimh)){
    THError("Inconsistent gradOutput size. oheight= %d, owidth= %d, gradOutput: %dx%d",
            oheight, owidth, gradOutput->size(dimh), gradOutput->size(dimw));
  }

  /* get raw pointers */
@@ -3,467 +3,467 @@
#else

static inline void THNN_(TemporalRowConvolution_shapeCheck)(
static void THNN_(unfolded_acc_row)(
static void THNN_(unfolded_copy_row)(
static void THNN_(TemporalRowConvolution_updateOutput_frame)(
void THNN_(TemporalRowConvolution_updateOutput)(
static void THNN_(TemporalRowConvolution_updateGradInput_frame)(
void THNN_(TemporalRowConvolution_updateGradInput)(
static void THNN_(TemporalRowConvolution_accGradParameters_frame)(
void THNN_(TemporalRowConvolution_accGradParameters)(

#endif
@@ -105,7 +105,7 @@ void THNN_(VolumetricAdaptiveAveragePooling_updateOutput)(


  THNN_ARGCHECK(!input->is_empty() && (input->dim() == 4 || input->dim() == 5), 2, input,
                "non-empty 4D or 5D (batch mode) tensor expected for input, but got: %s");

  if (input->dim() == 5)
  {
@@ -75,7 +75,7 @@ static inline void THNN_(VolumetricAveragePooling_shapeCheck)(

  if (otime < 1 || owidth < 1 || oheight < 1)
    THError("Given input size: (%dx%dx%dx%d). "
            "Calculated output size: (%dx%dx%dx%d). Output size is too small",
            nslices,itime,iheight,iwidth,nslices,otime,oheight,owidth);

  if (gradOutput != NULL) {
@@ -119,7 +119,7 @@ static THTensor* THNN_(newViewWeight)(THTensor *weight)
    int64_t s2 = weight->size(1) * weight->size(2) * weight->size(3) * weight->size(4);
    THTensor *old_weight = weight;
    weight = THTensor_(newWithStorage2d)(THTensor_getStoragePtr(weight), weight->storage_offset(),
                                         s1, -1, s2, -1);
    c10::raw::intrusive_ptr::decref(old_weight);
  }
  return weight;
@@ -274,7 +274,7 @@ void THNN_(VolumetricFullDilatedConvolution_updateOutput)(
      const int64_t k_ = 1;

      // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
      if (bias) {
        THBlas_(gemm)(
            't', 'n',
            n_, m_, k_,
@ -7,13 +7,13 @@
|
||||
#define torch_(NAME) TH_CONCAT_3(torch_, Real, NAME)
|
||||
#define nn_(NAME) TH_CONCAT_3(nn_, Real, NAME)
|
||||
|
||||
#define THNN_CHECK_SHAPE(I1, I2) \
|
||||
if (I1 != NULL && I2 != NULL && !THTensor_(isSameSizeAs)(I1, I2)) \
|
||||
{ \
|
||||
THDescBuff s1 = THTensor_(sizeDesc)(I1); \
|
||||
THDescBuff s2 = THTensor_(sizeDesc)(I2); \
|
||||
THError(#I1 " and " #I2 " shapes do not match: " \
|
||||
#I1 " %s, " #I2 " %s", s1.str, s2.str); \
|
||||
#define THNN_CHECK_SHAPE(I1, I2) \
|
||||
if (I1 != NULL && I2 != NULL && !THTensor_(isSameSizeAs)(I1, I2)) \
|
||||
{ \
|
||||
THDescBuff s1 = THTensor_(sizeDesc)(I1); \
|
||||
THDescBuff s2 = THTensor_(sizeDesc)(I2); \
|
||||
THError(#I1 " and " #I2 " shapes do not match: " \
|
||||
#I1 " %s, " #I2 " %s", s1.str, s2.str); \
|
||||
}
|
||||
|
||||
#define THNN_CHECK_SHAPE_INDICES(I1, I2) \
|
||||
@ -26,39 +26,39 @@
|
||||
}
|
||||
|
||||
#define THNN_CHECK_NELEMENT(I1, I2) \
|
||||
if (I1 != NULL && I2 != NULL ) { \
|
||||
ptrdiff_t n1 = THTensor_(nElement)(I1); \
|
||||
ptrdiff_t n2 = THTensor_(nElement)(I2); \
|
||||
if (n1 != n2) \
|
||||
{ \
|
||||
THDescBuff s1 = THTensor_(sizeDesc)(I1); \
|
||||
THDescBuff s2 = THTensor_(sizeDesc)(I2); \
|
||||
THError(#I1 " and " #I2 " have different number of elements: " \
|
||||
#I1 "%s has %ld elements, while " \
|
||||
#I2 "%s has %ld elements", s1.str, n1, s2.str, n2); \
|
||||
} \
|
||||
if (I1 != NULL && I2 != NULL ) { \
|
||||
ptrdiff_t n1 = THTensor_(nElement)(I1); \
|
||||
ptrdiff_t n2 = THTensor_(nElement)(I2); \
|
||||
if (n1 != n2) \
|
||||
{ \
|
||||
THDescBuff s1 = THTensor_(sizeDesc)(I1); \
|
||||
THDescBuff s2 = THTensor_(sizeDesc)(I2); \
|
||||
THError(#I1 " and " #I2 " have different number of elements: " \
|
||||
#I1 "%s has %ld elements, while " \
|
||||
#I2 "%s has %ld elements", s1.str, n1, s2.str, n2); \
|
||||
} \
|
||||
}
|
||||
|
||||
#define THNN_CHECK_DIM_SIZE(T, DIM, DIM_SIZE, SIZE) \
|
||||
if (THTensor_(nDimensionLegacyNoScalars)(T) != DIM || \
|
||||
THTensor_sizeLegacyNoScalars(T, DIM_SIZE) != SIZE) { \
|
||||
THDescBuff s1 = THTensor_(sizeDesc)(T); \
|
||||
THError("Need " #T " of dimension %d and " #T ".size[%d] == %d" \
|
||||
" but got " #T " to be of shape: %s", DIM, DIM_SIZE, SIZE, s1.str); \
|
||||
#define THNN_CHECK_DIM_SIZE(T, DIM, DIM_SIZE, SIZE) \
|
||||
if (THTensor_(nDimensionLegacyNoScalars)(T) != DIM || \
|
||||
THTensor_sizeLegacyNoScalars(T, DIM_SIZE) != SIZE) { \
|
||||
THDescBuff s1 = THTensor_(sizeDesc)(T); \
|
||||
THError("Need " #T " of dimension %d and " #T ".size[%d] == %d" \
|
||||
" but got " #T " to be of shape: %s", DIM, DIM_SIZE, SIZE, s1.str); \
|
||||
}
|
||||
|
||||
#define THNN_CHECK_DIM_SIZE_INDICES(T, DIM, DIM_SIZE, SIZE) \
|
||||
if (THIndexTensor_(nDimensionLegacyNoScalars)(T) != DIM || \
|
||||
THTensor_sizeLegacyNoScalars(T, DIM_SIZE) != SIZE) { \
|
||||
THDescBuff s1 = THIndexTensor_(sizeDesc)(T); \
|
||||
THError("Need " #T " of dimension %d and " #T ".size[%d] == %d" \
|
||||
#define THNN_CHECK_DIM_SIZE_INDICES(T, DIM, DIM_SIZE, SIZE) \
|
||||
if (THIndexTensor_(nDimensionLegacyNoScalars)(T) != DIM || \
|
||||
THTensor_sizeLegacyNoScalars(T, DIM_SIZE) != SIZE) { \
|
||||
THDescBuff s1 = THIndexTensor_(sizeDesc)(T); \
|
||||
THError("Need " #T " of dimension %d and " #T ".size[%d] == %d" \
|
||||
" but got " #T " to be of shape: %s", DIM, DIM_SIZE, SIZE, s1.str); \
|
||||
}
|
||||
|
||||
#define THNN_ARGCHECK(COND, ARG, T, FORMAT) \
|
||||
if (!(COND)) { \
|
||||
THDescBuff s1 = THTensor_(sizeDesc)(T); \
|
||||
THArgCheck(COND, ARG, FORMAT, s1.str); \
|
||||
#define THNN_ARGCHECK(COND, ARG, T, FORMAT) \
|
||||
if (!(COND)) { \
|
||||
THDescBuff s1 = THTensor_(sizeDesc)(T); \
|
||||
THArgCheck(COND, ARG, FORMAT, s1.str); \
|
||||
}
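These helpers all follow the same pattern: test a property of the arguments and, on failure, format the offending tensor sizes into the error message via `THTensor_(sizeDesc)`. Below is a minimal standalone sketch of that check-and-report pattern using plain `std::vector` sizes; it is illustrative only and not the THNN code.

```cpp
#include <cstdio>
#include <stdexcept>
#include <string>
#include <vector>

// Simple stand-in for THTensor_(sizeDesc): render a size vector as text.
static std::string size_desc(const std::vector<long>& sizes) {
  std::string s = "[";
  for (std::size_t i = 0; i < sizes.size(); ++i) {
    if (i != 0) s += ", ";
    s += std::to_string(sizes[i]);
  }
  return s + "]";
}

// Same check-and-report idea as THNN_CHECK_SHAPE, minus the TH types.
#define CHECK_SHAPE(I1, I2)                                           \
  if ((I1) != (I2)) {                                                 \
    throw std::runtime_error(std::string(#I1 " and " #I2              \
        " shapes do not match: " #I1 " ") + size_desc(I1) +           \
        ", " #I2 " " + size_desc(I2));                                \
  }

int main() {
  std::vector<long> input{4, 3}, target{4, 2};
  try {
    CHECK_SHAPE(input, target);
  } catch (const std::exception& e) {
    // prints: input and target shapes do not match: input [4, 3], target [4, 2]
    std::puts(e.what());
  }
}
```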
|
||||
|
||||
#include <THNN/generic/AbsCriterion.c>
|
||||
|
@ -206,7 +206,7 @@ TEST(LeftRightTest, givenInt_whenWriteThrowsExceptionOnSecondCall_thenKeepsNewSt
|
||||
write_called = true;
|
||||
}
|
||||
}),
|
||||
MyException
|
||||
MyException
|
||||
);
|
||||
|
||||
// check reading it returns new value
|
||||
|
290
c10/util/Half.h
@ -85,41 +85,41 @@ namespace detail {
|
||||
* @note The implementation doesn't use any floating-point operations.
|
||||
*/
|
||||
inline uint32_t fp16_ieee_to_fp32_bits(uint16_t h) {
|
||||
/*
|
||||
* Extend the half-precision floating-point number to 32 bits and shift to the upper part of the 32-bit word:
|
||||
* +---+-----+------------+-------------------+
|
||||
* | S |EEEEE|MM MMMM MMMM|0000 0000 0000 0000|
|
||||
* +---+-----+------------+-------------------+
|
||||
* Bits 31 26-30 16-25 0-15
|
||||
*
|
||||
* S - sign bit, E - bits of the biased exponent, M - bits of the mantissa, 0 - zero bits.
|
||||
*/
|
||||
const uint32_t w = (uint32_t) h << 16;
|
||||
/*
|
||||
* Extract the sign of the input number into the high bit of the 32-bit word:
|
||||
*
|
||||
* +---+----------------------------------+
|
||||
* | S |0000000 00000000 00000000 00000000|
|
||||
* +---+----------------------------------+
|
||||
* Bits 31 0-31
|
||||
*/
|
||||
const uint32_t sign = w & UINT32_C(0x80000000);
|
||||
/*
|
||||
* Extract mantissa and biased exponent of the input number into the bits 0-30 of the 32-bit word:
|
||||
*
|
||||
* +---+-----+------------+-------------------+
|
||||
* | 0 |EEEEE|MM MMMM MMMM|0000 0000 0000 0000|
|
||||
* +---+-----+------------+-------------------+
|
||||
* Bits 30 27-31 17-26 0-16
|
||||
*/
|
||||
const uint32_t nonsign = w & UINT32_C(0x7FFFFFFF);
|
||||
/*
|
||||
* Renorm shift is the number of bits to shift mantissa left to make the half-precision number normalized.
|
||||
* If the initial number is normalized, some of its high 6 bits (sign == 0 and 5-bit exponent) equals one.
|
||||
* In this case renorm_shift == 0. If the number is denormalized, renorm_shift > 0. Note that if we shift
|
||||
* denormalized nonsign by renorm_shift, the unit bit of mantissa will shift into exponent, turning the
|
||||
* biased exponent into 1, and making mantissa normalized (i.e. without leading 1).
|
||||
*/
|
||||
/*
|
||||
* Extend the half-precision floating-point number to 32 bits and shift to the upper part of the 32-bit word:
|
||||
* +---+-----+------------+-------------------+
|
||||
* | S |EEEEE|MM MMMM MMMM|0000 0000 0000 0000|
|
||||
* +---+-----+------------+-------------------+
|
||||
* Bits 31 26-30 16-25 0-15
|
||||
*
|
||||
* S - sign bit, E - bits of the biased exponent, M - bits of the mantissa, 0 - zero bits.
|
||||
*/
|
||||
const uint32_t w = (uint32_t) h << 16;
|
||||
/*
|
||||
* Extract the sign of the input number into the high bit of the 32-bit word:
|
||||
*
|
||||
* +---+----------------------------------+
|
||||
* | S |0000000 00000000 00000000 00000000|
|
||||
* +---+----------------------------------+
|
||||
* Bits 31 0-31
|
||||
*/
|
||||
const uint32_t sign = w & UINT32_C(0x80000000);
|
||||
/*
|
||||
* Extract mantissa and biased exponent of the input number into the bits 0-30 of the 32-bit word:
|
||||
*
|
||||
* +---+-----+------------+-------------------+
|
||||
* | 0 |EEEEE|MM MMMM MMMM|0000 0000 0000 0000|
|
||||
* +---+-----+------------+-------------------+
|
||||
* Bits 30 27-31 17-26 0-16
|
||||
*/
|
||||
const uint32_t nonsign = w & UINT32_C(0x7FFFFFFF);
|
||||
/*
|
||||
* Renorm shift is the number of bits to shift mantissa left to make the half-precision number normalized.
|
||||
* If the initial number is normalized, some of its high 6 bits (sign == 0 and 5-bit exponent) equals one.
|
||||
* In this case renorm_shift == 0. If the number is denormalized, renorm_shift > 0. Note that if we shift
|
||||
* denormalized nonsign by renorm_shift, the unit bit of mantissa will shift into exponent, turning the
|
||||
* biased exponent into 1, and making mantissa normalized (i.e. without leading 1).
|
||||
*/
|
||||
#ifdef _MSC_VER
|
||||
unsigned long nonsign_bsr;
|
||||
_BitScanReverse(&nonsign_bsr, (unsigned long)nonsign);
|
||||
@ -176,62 +176,62 @@ namespace detail {
|
||||
* floating-point operations and bitcasts between integer and floating-point variables.
|
||||
*/
|
||||
inline float fp16_ieee_to_fp32_value(uint16_t h) {
|
||||
/*
|
||||
* Extend the half-precision floating-point number to 32 bits and shift to the upper part of the 32-bit word:
|
||||
* +---+-----+------------+-------------------+
|
||||
* | S |EEEEE|MM MMMM MMMM|0000 0000 0000 0000|
|
||||
* +---+-----+------------+-------------------+
|
||||
* Bits 31 26-30 16-25 0-15
|
||||
*
|
||||
* S - sign bit, E - bits of the biased exponent, M - bits of the mantissa, 0 - zero bits.
|
||||
*/
|
||||
const uint32_t w = (uint32_t) h << 16;
|
||||
/*
|
||||
* Extract the sign of the input number into the high bit of the 32-bit word:
|
||||
*
|
||||
* +---+----------------------------------+
|
||||
* | S |0000000 00000000 00000000 00000000|
|
||||
* +---+----------------------------------+
|
||||
* Bits 31 0-31
|
||||
*/
|
||||
const uint32_t sign = w & UINT32_C(0x80000000);
|
||||
/*
|
||||
* Extract mantissa and biased exponent of the input number into the high bits of the 32-bit word:
|
||||
*
|
||||
* +-----+------------+---------------------+
|
||||
* |EEEEE|MM MMMM MMMM|0 0000 0000 0000 0000|
|
||||
* +-----+------------+---------------------+
|
||||
* Bits 27-31 17-26 0-16
|
||||
*/
|
||||
const uint32_t two_w = w + w;
|
||||
/*
|
||||
* Extend the half-precision floating-point number to 32 bits and shift to the upper part of the 32-bit word:
|
||||
* +---+-----+------------+-------------------+
|
||||
* | S |EEEEE|MM MMMM MMMM|0000 0000 0000 0000|
|
||||
* +---+-----+------------+-------------------+
|
||||
* Bits 31 26-30 16-25 0-15
|
||||
*
|
||||
* S - sign bit, E - bits of the biased exponent, M - bits of the mantissa, 0 - zero bits.
|
||||
*/
|
||||
const uint32_t w = (uint32_t) h << 16;
|
||||
/*
|
||||
* Extract the sign of the input number into the high bit of the 32-bit word:
|
||||
*
|
||||
* +---+----------------------------------+
|
||||
* | S |0000000 00000000 00000000 00000000|
|
||||
* +---+----------------------------------+
|
||||
* Bits 31 0-31
|
||||
*/
|
||||
const uint32_t sign = w & UINT32_C(0x80000000);
|
||||
/*
|
||||
* Extract mantissa and biased exponent of the input number into the high bits of the 32-bit word:
|
||||
*
|
||||
* +-----+------------+---------------------+
|
||||
* |EEEEE|MM MMMM MMMM|0 0000 0000 0000 0000|
|
||||
* +-----+------------+---------------------+
|
||||
* Bits 27-31 17-26 0-16
|
||||
*/
|
||||
const uint32_t two_w = w + w;
|
||||
|
||||
/*
|
||||
* Shift mantissa and exponent into bits 23-28 and bits 13-22 so they become mantissa and exponent
|
||||
* of a single-precision floating-point number:
|
||||
*
|
||||
* S|Exponent | Mantissa
|
||||
* +-+---+-----+------------+----------------+
|
||||
* |0|000|EEEEE|MM MMMM MMMM|0 0000 0000 0000|
|
||||
* +-+---+-----+------------+----------------+
|
||||
* Bits | 23-31 | 0-22
|
||||
*
|
||||
* Next, there are some adjustments to the exponent:
|
||||
* - The exponent needs to be corrected by the difference in exponent bias between single-precision and half-precision
|
||||
* formats (0x7F - 0xF = 0x70)
|
||||
* - Inf and NaN values in the inputs should become Inf and NaN values after conversion to the single-precision number.
|
||||
* Therefore, if the biased exponent of the half-precision input was 0x1F (max possible value), the biased exponent
|
||||
* of the single-precision output must be 0xFF (max possible value). We do this correction in two steps:
|
||||
* - First, we adjust the exponent by (0xFF - 0x1F) = 0xE0 (see exp_offset below) rather than by 0x70 suggested
|
||||
* by the difference in the exponent bias (see above).
|
||||
* - Then we multiply the single-precision result of exponent adjustment by 2**(-112) to reverse the effect of
|
||||
* exponent adjustment by 0xE0 less the necessary exponent adjustment by 0x70 due to difference in exponent bias.
|
||||
* The floating-point multiplication hardware would ensure that Inf and NaN would retain their value on at least
|
||||
* partially IEEE754-compliant implementations.
|
||||
*
|
||||
* Note that the above operations do not handle denormal inputs (where biased exponent == 0). However, they also do not
|
||||
* operate on denormal inputs, and do not produce denormal results.
|
||||
*/
|
||||
const uint32_t exp_offset = UINT32_C(0xE0) << 23;
|
||||
/*
|
||||
* Shift mantissa and exponent into bits 23-28 and bits 13-22 so they become mantissa and exponent
|
||||
* of a single-precision floating-point number:
|
||||
*
|
||||
* S|Exponent | Mantissa
|
||||
* +-+---+-----+------------+----------------+
|
||||
* |0|000|EEEEE|MM MMMM MMMM|0 0000 0000 0000|
|
||||
* +-+---+-----+------------+----------------+
|
||||
* Bits | 23-31 | 0-22
|
||||
*
|
||||
* Next, there are some adjustments to the exponent:
|
||||
* - The exponent needs to be corrected by the difference in exponent bias between single-precision and half-precision
|
||||
* formats (0x7F - 0xF = 0x70)
|
||||
* - Inf and NaN values in the inputs should become Inf and NaN values after conversion to the single-precision number.
|
||||
* Therefore, if the biased exponent of the half-precision input was 0x1F (max possible value), the biased exponent
|
||||
* of the single-precision output must be 0xFF (max possible value). We do this correction in two steps:
|
||||
* - First, we adjust the exponent by (0xFF - 0x1F) = 0xE0 (see exp_offset below) rather than by 0x70 suggested
|
||||
* by the difference in the exponent bias (see above).
|
||||
* - Then we multiply the single-precision result of exponent adjustment by 2**(-112) to reverse the effect of
|
||||
* exponent adjustment by 0xE0 less the necessary exponent adjustment by 0x70 due to difference in exponent bias.
|
||||
* The floating-point multiplication hardware would ensure that Inf and NaN would retain their value on at least
|
||||
* partially IEEE754-compliant implementations.
|
||||
*
|
||||
* Note that the above operations do not handle denormal inputs (where biased exponent == 0). However, they also do not
|
||||
* operate on denormal inputs, and do not produce denormal results.
|
||||
*/
|
||||
const uint32_t exp_offset = UINT32_C(0xE0) << 23;
|
||||
// const float exp_scale = 0x1.0p-112f;
|
||||
uint32_t scale_bits = (uint32_t) 15 << 23;
|
||||
float exp_scale_val;
|
||||
@ -239,48 +239,48 @@ namespace detail {
|
||||
const float exp_scale = exp_scale_val;
|
||||
const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale;
|
||||
|
||||
/*
|
||||
* Convert denormalized half-precision inputs into single-precision results (always normalized).
|
||||
* Zero inputs are also handled here.
|
||||
*
|
||||
* In a denormalized number the biased exponent is zero, and mantissa has non-zero bits.
|
||||
* First, we shift mantissa into bits 0-9 of the 32-bit word.
|
||||
*
|
||||
* zeros | mantissa
|
||||
* +---------------------------+------------+
|
||||
* |0000 0000 0000 0000 0000 00|MM MMMM MMMM|
|
||||
* +---------------------------+------------+
|
||||
* Bits 10-31 0-9
|
||||
*
|
||||
* Now, remember that denormalized half-precision numbers are represented as:
|
||||
* FP16 = mantissa * 2**(-24).
|
||||
* The trick is to construct a normalized single-precision number with the same mantissa as the half-precision input
|
||||
* and with an exponent which would scale the corresponding mantissa bits to 2**(-24).
|
||||
* A normalized single-precision floating-point number is represented as:
|
||||
* FP32 = (1 + mantissa * 2**(-23)) * 2**(exponent - 127)
|
||||
* Therefore, when the biased exponent is 126, a unit change in the mantissa of the input denormalized half-precision
|
||||
* number causes a change of the constructed single-precision number by 2**(-24), i.e. the same amount.
|
||||
*
|
||||
* The last step is to adjust the bias of the constructed single-precision number. When the input half-precision number
|
||||
* is zero, the constructed single-precision number has the value of
|
||||
* FP32 = 1 * 2**(126 - 127) = 2**(-1) = 0.5
|
||||
* Therefore, we need to subtract 0.5 from the constructed single-precision number to get the numerical equivalent of
|
||||
* the input half-precision number.
|
||||
*/
|
||||
const uint32_t magic_mask = UINT32_C(126) << 23;
|
||||
const float magic_bias = 0.5f;
|
||||
const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias;
|
||||
/*
|
||||
* Convert denormalized half-precision inputs into single-precision results (always normalized).
|
||||
* Zero inputs are also handled here.
|
||||
*
|
||||
* In a denormalized number the biased exponent is zero, and mantissa has non-zero bits.
|
||||
* First, we shift mantissa into bits 0-9 of the 32-bit word.
|
||||
*
|
||||
* zeros | mantissa
|
||||
* +---------------------------+------------+
|
||||
* |0000 0000 0000 0000 0000 00|MM MMMM MMMM|
|
||||
* +---------------------------+------------+
|
||||
* Bits 10-31 0-9
|
||||
*
|
||||
* Now, remember that denormalized half-precision numbers are represented as:
|
||||
* FP16 = mantissa * 2**(-24).
|
||||
* The trick is to construct a normalized single-precision number with the same mantissa as the half-precision input
|
||||
* and with an exponent which would scale the corresponding mantissa bits to 2**(-24).
|
||||
* A normalized single-precision floating-point number is represented as:
|
||||
* FP32 = (1 + mantissa * 2**(-23)) * 2**(exponent - 127)
|
||||
* Therefore, when the biased exponent is 126, a unit change in the mantissa of the input denormalized half-precision
|
||||
* number causes a change of the constructed single-precision number by 2**(-24), i.e. the same amount.
|
||||
*
|
||||
* The last step is to adjust the bias of the constructed single-precision number. When the input half-precision number
|
||||
* is zero, the constructed single-precision number has the value of
|
||||
* FP32 = 1 * 2**(126 - 127) = 2**(-1) = 0.5
|
||||
* Therefore, we need to subtract 0.5 from the constructed single-precision number to get the numerical equivalent of
|
||||
* the input half-precision number.
|
||||
*/
|
||||
const uint32_t magic_mask = UINT32_C(126) << 23;
|
||||
const float magic_bias = 0.5f;
|
||||
const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias;
|
||||
|
||||
/*
|
||||
* - Choose either results of conversion of input as a normalized number, or as a denormalized number, depending on the
|
||||
* input exponent. The variable two_w contains the input exponent in bits 27-31, therefore if it is smaller than 2**27, the
|
||||
* input is either a denormal number, or zero.
|
||||
* - Combine the result of conversion of exponent and mantissa with the sign of the input number.
|
||||
*/
|
||||
const uint32_t denormalized_cutoff = UINT32_C(1) << 27;
|
||||
const uint32_t result = sign |
|
||||
(two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value));
|
||||
return fp32_from_bits(result);
|
||||
/*
|
||||
* - Choose either results of conversion of input as a normalized number, or as a denormalized number, depending on the
|
||||
* input exponent. The variable two_w contains the input exponent in bits 27-31, therefore if it is smaller than 2**27, the
|
||||
* input is either a denormal number, or zero.
|
||||
* - Combine the result of conversion of exponent and mantissa with the sign of the input number.
|
||||
*/
|
||||
const uint32_t denormalized_cutoff = UINT32_C(1) << 27;
|
||||
const uint32_t result = sign |
|
||||
(two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value));
|
||||
return fp32_from_bits(result);
|
||||
}
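The bit manipulation described in the comments above can be exercised as a small standalone program. The following is a sketch that re-implements the same steps outside the header (the real implementation is the `fp16_ieee_to_fp32_value` shown here); `fp32_from_bits`/`fp32_to_bits` are spelled out with `memcpy`, and the test values in `main` are illustrative.

```cpp
#include <cstdint>
#include <cstdio>
#include <cstring>

static float fp32_from_bits(uint32_t w) { float f; std::memcpy(&f, &w, sizeof f); return f; }
static uint32_t fp32_to_bits(float f) { uint32_t w; std::memcpy(&w, &f, sizeof w); return w; }

// Sketch of the half -> float conversion described above.
static float half_to_float(uint16_t h) {
  const uint32_t w = (uint32_t) h << 16;
  const uint32_t sign = w & UINT32_C(0x80000000);
  const uint32_t two_w = w + w;

  // Normalized/Inf/NaN path: adjust the exponent by 0xE0, then scale by 2**(-112).
  const uint32_t exp_offset = UINT32_C(0xE0) << 23;
  const float exp_scale = fp32_from_bits(UINT32_C(15) << 23);  // 2**(-112)
  const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale;

  // Denormal/zero path: build a float with biased exponent 126, then subtract the 0.5 bias.
  const uint32_t magic_mask = UINT32_C(126) << 23;
  const float magic_bias = 0.5f;
  const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias;

  const uint32_t denormalized_cutoff = UINT32_C(1) << 27;
  const uint32_t result = sign |
      (two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value)
                                   : fp32_to_bits(normalized_value));
  return fp32_from_bits(result);
}

int main() {
  std::printf("%g %g %g\n",
              half_to_float(0x3C00),   // 1.0
              half_to_float(0xC000),   // -2.0
              half_to_float(0x0001));  // smallest denormal, 2**(-24)
}
```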
|
||||
|
||||
/*
|
||||
@ -301,22 +301,22 @@ namespace detail {
|
||||
const float scale_to_inf = scale_to_inf_val;
|
||||
const float scale_to_zero = scale_to_zero_val;
|
||||
|
||||
float base = (fabsf(f) * scale_to_inf) * scale_to_zero;
|
||||
float base = (fabsf(f) * scale_to_inf) * scale_to_zero;
|
||||
|
||||
const uint32_t w = fp32_to_bits(f);
|
||||
const uint32_t shl1_w = w + w;
|
||||
const uint32_t sign = w & UINT32_C(0x80000000);
|
||||
uint32_t bias = shl1_w & UINT32_C(0xFF000000);
|
||||
if (bias < UINT32_C(0x71000000)) {
|
||||
bias = UINT32_C(0x71000000);
|
||||
}
|
||||
const uint32_t w = fp32_to_bits(f);
|
||||
const uint32_t shl1_w = w + w;
|
||||
const uint32_t sign = w & UINT32_C(0x80000000);
|
||||
uint32_t bias = shl1_w & UINT32_C(0xFF000000);
|
||||
if (bias < UINT32_C(0x71000000)) {
|
||||
bias = UINT32_C(0x71000000);
|
||||
}
|
||||
|
||||
base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base;
|
||||
const uint32_t bits = fp32_to_bits(base);
|
||||
const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00);
|
||||
const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF);
|
||||
const uint32_t nonsign = exp_bits + mantissa_bits;
|
||||
return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
|
||||
base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base;
|
||||
const uint32_t bits = fp32_to_bits(base);
|
||||
const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00);
|
||||
const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF);
|
||||
const uint32_t nonsign = exp_bits + mantissa_bits;
|
||||
return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
|
||||
}
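Assuming these helpers are used as exported by this header (the qualified names below, `c10::detail::fp16_ieee_from_fp32_value` and `c10::detail::fp16_ieee_to_fp32_value`, reflect the surrounding `detail` namespace and are an assumption of this sketch), a quick round-trip check might look like this; it needs a PyTorch C++ build to compile.

```cpp
#include <cstdint>
#include <cstdio>

#include <c10/util/Half.h>  // assumed include path for this header

int main() {
  const float x = 0.1f;
  // float -> half bits -> float; half keeps ~11 mantissa bits, so expect only
  // about 3 significant decimal digits to survive the round trip.
  const uint16_t h = c10::detail::fp16_ieee_from_fp32_value(x);
  const float back = c10::detail::fp16_ieee_to_fp32_value(h);
  std::printf("%.8f -> 0x%04x -> %.8f\n", x, h, back);
}
```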
|
||||
|
||||
} // namespace detail
|
||||
|
@ -55,7 +55,7 @@ Assertion Passed!
|
||||
|
||||
</details>
|
||||
|
||||
)DOC")
|
||||
)DOC")
|
||||
.Arg(
|
||||
"error_msg",
|
||||
"(*string*): custom error message to be thrown when the input does not pass assertion",
|
||||
|
@ -107,17 +107,17 @@ Testing CountUp operator...
|
||||
'count' value after CountUp test: 10
|
||||
|
||||
Testing CountDown operator...
|
||||
'count' value after CountDown: 9 'done' value: False
|
||||
'count' value after CountDown: 8 'done' value: False
|
||||
'count' value after CountDown: 7 'done' value: False
|
||||
'count' value after CountDown: 6 'done' value: False
|
||||
'count' value after CountDown: 5 'done' value: False
|
||||
'count' value after CountDown: 4 'done' value: False
|
||||
'count' value after CountDown: 3 'done' value: False
|
||||
'count' value after CountDown: 2 'done' value: False
|
||||
'count' value after CountDown: 1 'done' value: False
|
||||
'count' value after CountDown: 0 'done' value: False
|
||||
'count' value after CountDown: -1 'done' value: True
|
||||
'count' value after CountDown: 9 'done' value: False
|
||||
'count' value after CountDown: 8 'done' value: False
|
||||
'count' value after CountDown: 7 'done' value: False
|
||||
'count' value after CountDown: 6 'done' value: False
|
||||
'count' value after CountDown: 5 'done' value: False
|
||||
'count' value after CountDown: 4 'done' value: False
|
||||
'count' value after CountDown: 3 'done' value: False
|
||||
'count' value after CountDown: 2 'done' value: False
|
||||
'count' value after CountDown: 1 'done' value: False
|
||||
'count' value after CountDown: 0 'done' value: False
|
||||
'count' value after CountDown: -1 'done' value: True
|
||||
```
|
||||
|
||||
</details>
|
||||
|
@ -24,11 +24,11 @@ OPERATOR_SCHEMA(Expand)
|
||||
.NumInputs(2)
|
||||
.NumOutputs(1)
|
||||
.SetDoc(R"DOC(
|
||||
Broadcast the input tensor to a materialized new tensor using given shape.
|
||||
Broadcast rule is similar to "numpy.array(input) * numpy.ones(shape)":
|
||||
Dimensions are right-aligned;
|
||||
Two corresponding dimensions must have the same value, or one of them
|
||||
equals to 1.
|
||||
Broadcast the input tensor to a materialized new tensor using given shape.
|
||||
Broadcast rule is similar to "numpy.array(input) * numpy.ones(shape)":
|
||||
Dimensions are right-aligned;
|
||||
Two corresponding dimensions must have the same value, or one of them
|
||||
equals to 1.
|
||||
In order to align with PyTorch's `expand`, `shape` is allowed to have entries
|
||||
equal to -1, which means to preserve the size of the corresponding dimension
|
||||
in `X` (so it's actually equivalent to specifying 1).
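The broadcast rule stated in this doc string can be made concrete with a small shape-only sketch; the function name and error handling here are illustrative, not the Caffe2 implementation.

```cpp
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <stdexcept>
#include <vector>

// Compute the output shape of Expand from the input shape and the `shape`
// argument: align dimensions from the right, allow equal sizes or 1, and let
// -1 preserve the corresponding input dimension.
std::vector<int64_t> expand_shape(const std::vector<int64_t>& in,
                                  const std::vector<int64_t>& shape) {
  std::vector<int64_t> out(shape);
  for (std::size_t i = 0; i < shape.size(); ++i) {
    const std::size_t oi = shape.size() - 1 - i;
    const int64_t target = shape[oi];
    const int64_t src = i < in.size() ? in[in.size() - 1 - i] : 1;
    if (target == -1) {
      out[oi] = src;  // -1 keeps the input dimension
    } else if (src != target && src != 1 && target != 1) {
      throw std::invalid_argument("dimensions are not broadcastable");
    } else {
      out[oi] = std::max(src, target);
    }
  }
  return out;
}

// e.g. expand_shape({3, 1}, {2, -1, 4}) yields {2, 3, 4}
```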
|
||||
|
@ -758,10 +758,10 @@ if(USE_CUDA)
|
||||
endif()
|
||||
if(CAFFE2_USE_CUDNN)
|
||||
IF(CUDNN_STATIC_LINKAGE)
|
||||
LIST(APPEND Caffe2_PUBLIC_CUDA_DEPENDENCY_LIBS
|
||||
caffe2::cudnn "${CUDA_TOOLKIT_ROOT_DIR}/lib64/libculibos.a" "dl")
|
||||
LIST(APPEND Caffe2_PUBLIC_CUDA_DEPENDENCY_LIBS
|
||||
caffe2::cudnn "${CUDA_TOOLKIT_ROOT_DIR}/lib64/libculibos.a" "dl")
|
||||
ELSE()
|
||||
list(APPEND Caffe2_PUBLIC_CUDA_DEPENDENCY_LIBS caffe2::cudnn)
|
||||
list(APPEND Caffe2_PUBLIC_CUDA_DEPENDENCY_LIBS caffe2::cudnn)
|
||||
ENDIF()
|
||||
else()
|
||||
caffe2_update_option(USE_CUDNN OFF)
|
||||
@ -1166,7 +1166,7 @@ if (NOT BUILD_ATEN_MOBILE)
|
||||
|
||||
CHECK_C_SOURCE_COMPILES("#include <stdint.h>
|
||||
static inline void cpuid(uint32_t *eax, uint32_t *ebx,
|
||||
uint32_t *ecx, uint32_t *edx)
|
||||
uint32_t *ecx, uint32_t *edx)
|
||||
{
|
||||
uint32_t a = *eax, b, c = *ecx, d;
|
||||
asm volatile ( \"cpuid\" : \"+a\"(a), \"=b\"(b), \"+c\"(c), \"=d\"(d) );
|
||||
|
@ -3,16 +3,16 @@
|
||||
# CUB_INCLUDE_DIRS - the CUB include directory
|
||||
|
||||
find_path(CUB_INCLUDE_DIR
|
||||
NAMES cub/cub.cuh
|
||||
DOC "The directory where CUB includes reside"
|
||||
NAMES cub/cub.cuh
|
||||
DOC "The directory where CUB includes reside"
|
||||
)
|
||||
|
||||
set(CUB_INCLUDE_DIRS ${CUB_INCLUDE_DIR})
|
||||
|
||||
include(FindPackageHandleStandardArgs)
|
||||
find_package_handle_standard_args(CUB
|
||||
FOUND_VAR CUB_FOUND
|
||||
REQUIRED_VARS CUB_INCLUDE_DIR
|
||||
FOUND_VAR CUB_FOUND
|
||||
REQUIRED_VARS CUB_INCLUDE_DIR
|
||||
)
|
||||
|
||||
mark_as_advanced(CUB_FOUND)
|
||||
|
@ -35,20 +35,20 @@ find_package_handle_standard_args(
|
||||
MIOPEN DEFAULT_MSG MIOPEN_INCLUDE_DIR MIOPEN_LIBRARY)
|
||||
|
||||
if(MIOPEN_FOUND)
|
||||
# get MIOpen version
|
||||
# get MIOpen version
|
||||
file(READ ${MIOPEN_INCLUDE_DIR}/version.h MIOPEN_HEADER_CONTENTS)
|
||||
string(REGEX MATCH "define MIOPEN_VERSION_MAJOR * +([0-9]+)"
|
||||
MIOPEN_VERSION_MAJOR "${MIOPEN_HEADER_CONTENTS}")
|
||||
string(REGEX REPLACE "define MIOPEN_VERSION_MAJOR * +([0-9]+)" "\\1"
|
||||
MIOPEN_VERSION_MAJOR "${MIOPEN_VERSION_MAJOR}")
|
||||
string(REGEX MATCH "define MIOPEN_VERSION_MINOR * +([0-9]+)"
|
||||
MIOPEN_VERSION_MINOR "${MIOPEN_HEADER_CONTENTS}")
|
||||
string(REGEX REPLACE "define MIOPEN_VERSION_MINOR * +([0-9]+)" "\\1"
|
||||
MIOPEN_VERSION_MINOR "${MIOPEN_VERSION_MINOR}")
|
||||
string(REGEX MATCH "define MIOPEN_VERSION_PATCH * +([0-9]+)"
|
||||
MIOPEN_VERSION_PATCH "${MIOPEN_HEADER_CONTENTS}")
|
||||
string(REGEX REPLACE "define MIOPEN_VERSION_PATCH * +([0-9]+)" "\\1"
|
||||
MIOPEN_VERSION_PATCH "${MIOPEN_VERSION_PATCH}")
|
||||
string(REGEX MATCH "define MIOPEN_VERSION_MAJOR * +([0-9]+)"
|
||||
MIOPEN_VERSION_MAJOR "${MIOPEN_HEADER_CONTENTS}")
|
||||
string(REGEX REPLACE "define MIOPEN_VERSION_MAJOR * +([0-9]+)" "\\1"
|
||||
MIOPEN_VERSION_MAJOR "${MIOPEN_VERSION_MAJOR}")
|
||||
string(REGEX MATCH "define MIOPEN_VERSION_MINOR * +([0-9]+)"
|
||||
MIOPEN_VERSION_MINOR "${MIOPEN_HEADER_CONTENTS}")
|
||||
string(REGEX REPLACE "define MIOPEN_VERSION_MINOR * +([0-9]+)" "\\1"
|
||||
MIOPEN_VERSION_MINOR "${MIOPEN_VERSION_MINOR}")
|
||||
string(REGEX MATCH "define MIOPEN_VERSION_PATCH * +([0-9]+)"
|
||||
MIOPEN_VERSION_PATCH "${MIOPEN_HEADER_CONTENTS}")
|
||||
string(REGEX REPLACE "define MIOPEN_VERSION_PATCH * +([0-9]+)" "\\1"
|
||||
MIOPEN_VERSION_PATCH "${MIOPEN_VERSION_PATCH}")
|
||||
# Assemble MIOpen version
|
||||
if(NOT MIOPEN_VERSION_MAJOR)
|
||||
set(MIOPEN_VERSION "?")
|
||||
|
@ -3,16 +3,16 @@
|
||||
# pybind11_INCLUDE_DIRS - the pybind11 include directory
|
||||
|
||||
find_path(pybind11_INCLUDE_DIR
|
||||
NAMES pybind11/pybind11.h
|
||||
DOC "The directory where pybind11 includes reside"
|
||||
NAMES pybind11/pybind11.h
|
||||
DOC "The directory where pybind11 includes reside"
|
||||
)
|
||||
|
||||
set(pybind11_INCLUDE_DIRS ${pybind11_INCLUDE_DIR})
|
||||
|
||||
include(FindPackageHandleStandardArgs)
|
||||
find_package_handle_standard_args(pybind11
|
||||
FOUND_VAR pybind11_FOUND
|
||||
REQUIRED_VARS pybind11_INCLUDE_DIR
|
||||
FOUND_VAR pybind11_FOUND
|
||||
REQUIRED_VARS pybind11_INCLUDE_DIR
|
||||
)
|
||||
|
||||
mark_as_advanced(pybind11_FOUND)
|
||||
|
File diff suppressed because it is too large
@ -145,10 +145,10 @@ allowed values for these axes at the moment are:
|
||||
|
||||
.. tip::
|
||||
|
||||
There exist "Rust-style" shorthands for dtypes, like ``kF32`` instead of
|
||||
``kFloat32``. See `here
|
||||
<https://github.com/pytorch/pytorch/blob/master/torch/csrc/api/include/torch/types.h>`_
|
||||
for the full list.
|
||||
There exist "Rust-style" shorthands for dtypes, like ``kF32`` instead of
|
||||
``kFloat32``. See `here
|
||||
<https://github.com/pytorch/pytorch/blob/master/torch/csrc/api/include/torch/types.h>`_
|
||||
for the full list.
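For example, with the C++ frontend (a minimal sketch assuming a libtorch build; ``kF32`` is one of the shorthands listed in the linked ``types.h``):

```cpp
#include <torch/torch.h>

int main() {
  // kF32 is the shorthand spelling of kFloat32; both name the same dtype.
  auto options = torch::TensorOptions().dtype(torch::kF32).device(torch::kCPU);
  auto t = torch::zeros({2, 3}, options);
}
```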
|
||||
|
||||
|
||||
An instance of ``TensorOptions`` stores a concrete value for each of these
|
||||
@ -314,8 +314,8 @@ we can convert it from ``int64`` to ``float32``:
|
||||
|
||||
.. attention::
|
||||
|
||||
The result of the conversion, ``float_tensor``, is a new tensor pointing to
|
||||
new memory, unrelated to the source ``source_tensor``.
|
||||
The result of the conversion, ``float_tensor``, is a new tensor pointing to
|
||||
new memory, unrelated to the source ``source_tensor``.
|
||||
|
||||
We can then move it from CPU memory to GPU memory:
|
||||
|
||||
|
@ -5,7 +5,7 @@ pushd %~dp0
|
||||
REM Command file for Sphinx documentation
|
||||
|
||||
if "%SPHINXBUILD%" == "" (
|
||||
set SPHINXBUILD=sphinx-build
|
||||
set SPHINXBUILD=sphinx-build
|
||||
)
|
||||
set SOURCEDIR=source
|
||||
set BUILDDIR=build
|
||||
@ -15,15 +15,15 @@ if "%1" == "" goto help
|
||||
|
||||
%SPHINXBUILD% >NUL 2>NUL
|
||||
if errorlevel 9009 (
|
||||
echo.
|
||||
echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
|
||||
echo.installed, then set the SPHINXBUILD environment variable to point
|
||||
echo.to the full path of the 'sphinx-build' executable. Alternatively you
|
||||
echo.may add the Sphinx directory to PATH.
|
||||
echo.
|
||||
echo.If you don't have Sphinx installed, grab it from
|
||||
echo.http://sphinx-doc.org/
|
||||
exit /b 1
|
||||
echo.
|
||||
echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
|
||||
echo.installed, then set the SPHINXBUILD environment variable to point
|
||||
echo.to the full path of the 'sphinx-build' executable. Alternatively you
|
||||
echo.may add the Sphinx directory to PATH.
|
||||
echo.
|
||||
echo.If you don't have Sphinx installed, grab it from
|
||||
echo.http://sphinx-doc.org/
|
||||
exit /b 1
|
||||
)
|
||||
|
||||
%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
|
||||
|
@ -770,34 +770,34 @@ Interpreting Graphs
|
||||
|
||||
The example script above produces the graph::
|
||||
|
||||
graph(%len : int) {
|
||||
%15 : int = prim::Constant[value=1]()
|
||||
%9 : bool = prim::Constant[value=1]()
|
||||
%7 : Device = prim::Constant[value="cpu"]()
|
||||
%6 : int = prim::Constant[value=0]()
|
||||
%5 : int = prim::Constant[value=6]()
|
||||
%1 : int = prim::Constant[value=3]()
|
||||
%2 : int = prim::Constant[value=4]()
|
||||
%11 : int = prim::Constant[value=10]()
|
||||
%14 : float = prim::Constant[value=1]()
|
||||
%4 : int[] = prim::ListConstruct(%1, %2)
|
||||
%rv.1 : Tensor = aten::zeros(%4, %5, %6, %7)
|
||||
%rv : Tensor = prim::Loop(%len, %9, %rv.1)
|
||||
block0(%i : int, %13 : Tensor) {
|
||||
%12 : bool = aten::lt(%i, %11)
|
||||
%rv.4 : Tensor = prim::If(%12)
|
||||
block0() {
|
||||
%rv.2 : Tensor = aten::sub(%13, %14, %15)
|
||||
-> (%rv.2)
|
||||
}
|
||||
block1() {
|
||||
%rv.3 : Tensor = aten::add(%13, %14, %15)
|
||||
-> (%rv.3)
|
||||
}
|
||||
-> (%9, %rv.4)
|
||||
}
|
||||
return (%rv);
|
||||
}
|
||||
graph(%len : int) {
|
||||
%15 : int = prim::Constant[value=1]()
|
||||
%9 : bool = prim::Constant[value=1]()
|
||||
%7 : Device = prim::Constant[value="cpu"]()
|
||||
%6 : int = prim::Constant[value=0]()
|
||||
%5 : int = prim::Constant[value=6]()
|
||||
%1 : int = prim::Constant[value=3]()
|
||||
%2 : int = prim::Constant[value=4]()
|
||||
%11 : int = prim::Constant[value=10]()
|
||||
%14 : float = prim::Constant[value=1]()
|
||||
%4 : int[] = prim::ListConstruct(%1, %2)
|
||||
%rv.1 : Tensor = aten::zeros(%4, %5, %6, %7)
|
||||
%rv : Tensor = prim::Loop(%len, %9, %rv.1)
|
||||
block0(%i : int, %13 : Tensor) {
|
||||
%12 : bool = aten::lt(%i, %11)
|
||||
%rv.4 : Tensor = prim::If(%12)
|
||||
block0() {
|
||||
%rv.2 : Tensor = aten::sub(%13, %14, %15)
|
||||
-> (%rv.2)
|
||||
}
|
||||
block1() {
|
||||
%rv.3 : Tensor = aten::add(%13, %14, %15)
|
||||
-> (%rv.3)
|
||||
}
|
||||
-> (%9, %rv.4)
|
||||
}
|
||||
return (%rv);
|
||||
}
|
||||
|
||||
|
||||
Take the instruction ``%rv.1 : Dynamic = aten::zeros(%3, %4, %5, %6)`` for
|
||||
@ -850,39 +850,39 @@ Automatic Trace Checking
|
||||
traced = torch.jit.trace(loop_in_traced_fn, inputs, check_inputs=check_inputs)
|
||||
|
||||
Gives us the following diagnostic information::
|
||||
ERROR: Graphs differed across invocations!
|
||||
Graph diff::
|
||||
ERROR: Graphs differed across invocations!
|
||||
Graph diff::
|
||||
|
||||
graph(%x : Tensor) {
|
||||
%1 : int = prim::Constant[value=0]()
|
||||
%2 : int = prim::Constant[value=0]()
|
||||
%result.1 : Tensor = aten::select(%x, %1, %2)
|
||||
%4 : int = prim::Constant[value=0]()
|
||||
%5 : int = prim::Constant[value=0]()
|
||||
%6 : Tensor = aten::select(%x, %4, %5)
|
||||
%result.2 : Tensor = aten::mul(%result.1, %6)
|
||||
%8 : int = prim::Constant[value=0]()
|
||||
%9 : int = prim::Constant[value=1]()
|
||||
%10 : Tensor = aten::select(%x, %8, %9)
|
||||
- %result : Tensor = aten::mul(%result.2, %10)
|
||||
+ %result.3 : Tensor = aten::mul(%result.2, %10)
|
||||
? ++
|
||||
%12 : int = prim::Constant[value=0]()
|
||||
%13 : int = prim::Constant[value=2]()
|
||||
%14 : Tensor = aten::select(%x, %12, %13)
|
||||
+ %result : Tensor = aten::mul(%result.3, %14)
|
||||
+ %16 : int = prim::Constant[value=0]()
|
||||
+ %17 : int = prim::Constant[value=3]()
|
||||
+ %18 : Tensor = aten::select(%x, %16, %17)
|
||||
- %15 : Tensor = aten::mul(%result, %14)
|
||||
? ^ ^
|
||||
+ %19 : Tensor = aten::mul(%result, %18)
|
||||
? ^ ^
|
||||
- return (%15);
|
||||
? ^
|
||||
+ return (%19);
|
||||
? ^
|
||||
}
|
||||
graph(%x : Tensor) {
|
||||
%1 : int = prim::Constant[value=0]()
|
||||
%2 : int = prim::Constant[value=0]()
|
||||
%result.1 : Tensor = aten::select(%x, %1, %2)
|
||||
%4 : int = prim::Constant[value=0]()
|
||||
%5 : int = prim::Constant[value=0]()
|
||||
%6 : Tensor = aten::select(%x, %4, %5)
|
||||
%result.2 : Tensor = aten::mul(%result.1, %6)
|
||||
%8 : int = prim::Constant[value=0]()
|
||||
%9 : int = prim::Constant[value=1]()
|
||||
%10 : Tensor = aten::select(%x, %8, %9)
|
||||
- %result : Tensor = aten::mul(%result.2, %10)
|
||||
+ %result.3 : Tensor = aten::mul(%result.2, %10)
|
||||
? ++
|
||||
%12 : int = prim::Constant[value=0]()
|
||||
%13 : int = prim::Constant[value=2]()
|
||||
%14 : Tensor = aten::select(%x, %12, %13)
|
||||
+ %result : Tensor = aten::mul(%result.3, %14)
|
||||
+ %16 : int = prim::Constant[value=0]()
|
||||
+ %17 : int = prim::Constant[value=3]()
|
||||
+ %18 : Tensor = aten::select(%x, %16, %17)
|
||||
- %15 : Tensor = aten::mul(%result, %14)
|
||||
? ^ ^
|
||||
+ %19 : Tensor = aten::mul(%result, %18)
|
||||
? ^ ^
|
||||
- return (%15);
|
||||
? ^
|
||||
+ return (%19);
|
||||
? ^
|
||||
}
|
||||
|
||||
|
||||
This message indicates to us that the computation differed between when
|
||||
@ -912,19 +912,19 @@ Automatic Trace Checking
|
||||
|
||||
Which produces::
|
||||
|
||||
graph(%x : Tensor) {
|
||||
%5 : bool = prim::Constant[value=1]()
|
||||
%1 : int = prim::Constant[value=0]()
|
||||
%result.1 : Tensor = aten::select(%x, %1, %1)
|
||||
%4 : int = aten::size(%x, %1)
|
||||
%result : Tensor = prim::Loop(%4, %5, %result.1)
|
||||
block0(%i : int, %7 : Tensor) {
|
||||
%10 : Tensor = aten::select(%x, %1, %i)
|
||||
%result.2 : Tensor = aten::mul(%7, %10)
|
||||
-> (%5, %result.2)
|
||||
}
|
||||
return (%result);
|
||||
}
|
||||
graph(%x : Tensor) {
|
||||
%5 : bool = prim::Constant[value=1]()
|
||||
%1 : int = prim::Constant[value=0]()
|
||||
%result.1 : Tensor = aten::select(%x, %1, %1)
|
||||
%4 : int = aten::size(%x, %1)
|
||||
%result : Tensor = prim::Loop(%4, %5, %result.1)
|
||||
block0(%i : int, %7 : Tensor) {
|
||||
%10 : Tensor = aten::select(%x, %1, %i)
|
||||
%result.2 : Tensor = aten::mul(%7, %10)
|
||||
-> (%5, %result.2)
|
||||
}
|
||||
return (%result);
|
||||
}
|
||||
|
||||
Tracer Warnings
|
||||
^^^^^^^^^^^^^^^
|
||||
|
@ -213,8 +213,8 @@ Multiprocessing error without if-clause protection
|
||||
.. code-block:: py3tb
|
||||
|
||||
RuntimeError:
|
||||
An attempt has been made to start a new process before the
|
||||
current process has finished its bootstrapping phase.
|
||||
An attempt has been made to start a new process before the
|
||||
current process has finished its bootstrapping phase.
|
||||
|
||||
This probably means that you are not using fork to start your
|
||||
child processes and you have forgotten to use the proper idiom
|
||||
|
@ -8,24 +8,24 @@
|
||||
PyInit*;
|
||||
init*;
|
||||
state;
|
||||
_ZGVZN2at*;
|
||||
_ZGVZN2at*;
|
||||
_ZN2at*;
|
||||
_ZNK2at*Type*;
|
||||
_ZNK2at*Tensor*;
|
||||
_ZNK2at*Storage*;
|
||||
_ZNK2at*Scalar*;
|
||||
_ZNK2at*CUDA*;
|
||||
*2at7Context*;
|
||||
_ZTIN2at*;
|
||||
_ZTIZN2at*;
|
||||
_ZTSN2at*;
|
||||
_ZTSPN2at*;
|
||||
_ZTSZN2at*;
|
||||
_ZTVN2at*;
|
||||
_ZZN2at*;
|
||||
_Z*torch*;
|
||||
_Z*Tensor*;
|
||||
_Z*tensor*;
|
||||
_ZNK2at*Type*;
|
||||
_ZNK2at*Tensor*;
|
||||
_ZNK2at*Storage*;
|
||||
_ZNK2at*Scalar*;
|
||||
_ZNK2at*CUDA*;
|
||||
*2at7Context*;
|
||||
_ZTIN2at*;
|
||||
_ZTIZN2at*;
|
||||
_ZTSN2at*;
|
||||
_ZTSPN2at*;
|
||||
_ZTSZN2at*;
|
||||
_ZTVN2at*;
|
||||
_ZZN2at*;
|
||||
_Z*torch*;
|
||||
_Z*Tensor*;
|
||||
_Z*tensor*;
|
||||
local:
|
||||
*;
|
||||
};
|
||||
|
@ -18,9 +18,9 @@ struct ConvOptions {
|
||||
int64_t input_channels,
|
||||
int64_t output_channels,
|
||||
ExpandingArray<D> kernel_size) :
|
||||
input_channels_(input_channels),
|
||||
output_channels_(output_channels),
|
||||
kernel_size_(std::move(kernel_size)) {}
|
||||
input_channels_(input_channels),
|
||||
output_channels_(output_channels),
|
||||
kernel_size_(std::move(kernel_size)) {}
|
||||
|
||||
/// The number of channels the input volumes will have.
|
||||
/// Changing this parameter after construction __has no effect__.
|
||||
|
@ -370,21 +370,21 @@ As the trace runs, individual operators create Nodes in the Graph being traced t
|
||||
torch::jit::Node* node = nullptr;
|
||||
std::shared_ptr<jit::tracer::TracingState> tracer_state;
|
||||
if (jit::tracer::isTracing()) {
|
||||
tracer_state = jit::tracer::getTracingState();
|
||||
at::Symbol op_name;
|
||||
op_name = jit::Symbol::fromQualString("aten::__ilshift__");
|
||||
node = tracer_state->graph->create(op_name, /*num_outputs=*/0);
|
||||
jit::tracer::recordSourceLocation(node);
|
||||
jit::tracer::addInputs(node, "self", self);
|
||||
jit::tracer::addInputs(node, "other", other);
|
||||
tracer_state->graph->insertNode(node);
|
||||
tracer_state = jit::tracer::getTracingState();
|
||||
at::Symbol op_name;
|
||||
op_name = jit::Symbol::fromQualString("aten::__ilshift__");
|
||||
node = tracer_state->graph->create(op_name, /*num_outputs=*/0);
|
||||
jit::tracer::recordSourceLocation(node);
|
||||
jit::tracer::addInputs(node, "self", self);
|
||||
jit::tracer::addInputs(node, "other", other);
|
||||
tracer_state->graph->insertNode(node);
|
||||
|
||||
jit::tracer::setTracingState(nullptr);
|
||||
jit::tracer::setTracingState(nullptr);
|
||||
}
|
||||
TypeDefault::__ilshift__(self, other);
|
||||
if (tracer_state) {
|
||||
jit::tracer::setTracingState(std::move(tracer_state));
|
||||
jit::tracer::addOutput(node, self);
|
||||
jit::tracer::setTracingState(std::move(tracer_state));
|
||||
jit::tracer::addOutput(node, self);
|
||||
}
|
||||
```
|
||||
|
||||
@ -412,15 +412,15 @@ Our frontends produce ASTs in the form of Tree objects. Trees are similar to [s-
|
||||
|
||||
```
|
||||
(-
|
||||
(+
|
||||
(variable (ident x))
|
||||
(variable (ident y)))
|
||||
(apply
|
||||
(.
|
||||
(variable (ident z))
|
||||
(ident sigmoid))
|
||||
(list)
|
||||
(list))))
|
||||
(+
|
||||
(variable (ident x))
|
||||
(variable (ident y)))
|
||||
(apply
|
||||
(.
|
||||
(variable (ident z))
|
||||
(ident sigmoid))
|
||||
(list)
|
||||
(list))))
|
||||
```
|
||||
|
||||
This is printed in s-expression style with `(kind ...)` representing compound trees and `string_value` representing strings.
|
||||
@ -454,16 +454,16 @@ The typical way to traverse a tree is to `switch` on the kind and then construct
|
||||
```cpp
|
||||
switch (tree.kind()) {
|
||||
case TK_VAR:
|
||||
auto var = Var(tree); // construct tree-view
|
||||
return environment_stack->getSugaredVar(var.name());
|
||||
auto var = Var(tree); // construct tree-view
|
||||
return environment_stack->getSugaredVar(var.name());
|
||||
case '.': {
|
||||
auto select = Select(tree); // construct tree-view
|
||||
auto sv = emitSugaredExpr(select.value(), 1);
|
||||
return sv->attr(select.range(), method, select.selector().name());
|
||||
auto select = Select(tree); // construct tree-view
|
||||
auto sv = emitSugaredExpr(select.value(), 1);
|
||||
return sv->attr(select.range(), method, select.selector().name());
|
||||
}
|
||||
case TK_APPLY: {
|
||||
auto apply = Apply(tree); // construct tree-view
|
||||
return emitApplyExpr(apply, n_binders);
|
||||
auto apply = Apply(tree); // construct tree-view
|
||||
return emitApplyExpr(apply, n_binders);
|
||||
} break;
|
||||
|
||||
```
|
||||
@ -507,7 +507,7 @@ Tokens are either keywords (`def`), operators (`+`), literals (`3.4`), or identi
|
||||
|
||||
```cpp
|
||||
if (lexer.nextIf('+')) {
|
||||
// handle + ...
|
||||
// handle + ...
|
||||
}
|
||||
```
|
||||
|
||||
@ -650,10 +650,10 @@ using Operation = std::function<int(Stack&)>;
|
||||
|
||||
// schema: example_add(Tensor a, Tensor b) -> Tensor
|
||||
int example_add(Stack& stack) {
|
||||
Tensor a, b;
|
||||
// stack before: ? ? ? a b <- back
|
||||
pop(stack, a, b); //Templated helper function
|
||||
// that pops a, b and converts them to tensor
|
||||
Tensor a, b;
|
||||
// stack before: ? ? ? a b <- back
|
||||
pop(stack, a, b); //Templated helper function
|
||||
// that pops a, b and converts them to tensor
|
||||
push(stack, a + b);
|
||||
// stack after:
|
||||
// ? ? ? c <- back
|
||||
@ -1126,7 +1126,7 @@ As a more involved example, the following TorchScript snippet:
|
||||
```python
|
||||
@torch.jit.script
|
||||
def foo(a : Tensor, b : Tensor):
|
||||
c = 2 * b
|
||||
c = 2 * b
|
||||
a += 1
|
||||
if a.max() > 4:
|
||||
r = a[0]
|
||||
|
@ -71,15 +71,15 @@ public:
|
||||
auto size = tuple ? PyTuple_GET_SIZE(source) : PyList_GET_SIZE(source);
|
||||
v_value.resize(size);
|
||||
for (int idx = 0; idx < size; idx++) {
|
||||
PyObject* obj = tuple ? PyTuple_GET_ITEM(source, idx) : PyList_GET_ITEM(source, idx);
|
||||
if (THPVariable_Check(obj)) {
|
||||
v_value[idx] = THPVariable_Unpack(obj).item<int64_t>();
|
||||
} else if (PyLong_Check(obj)) {
|
||||
// use THPUtils_unpackLong after it is safe to include python_numbers.h
|
||||
v_value[idx] = THPUtils_unpackLong(obj);
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
PyObject* obj = tuple ? PyTuple_GET_ITEM(source, idx) : PyList_GET_ITEM(source, idx);
|
||||
if (THPVariable_Check(obj)) {
|
||||
v_value[idx] = THPVariable_Unpack(obj).item<int64_t>();
|
||||
} else if (PyLong_Check(obj)) {
|
||||
// use THPUtils_unpackLong after it is safe to include python_numbers.h
|
||||
v_value[idx] = THPUtils_unpackLong(obj);
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
value = v_value;
|
||||
return true;
|
||||
|
@ -195,7 +195,7 @@ ScalarType numpy_dtype_to_aten(int dtype) {
|
||||
|
||||
bool is_numpy_scalar(PyObject* obj) {
|
||||
return (PyArray_IsIntegerScalar(obj) ||
|
||||
PyArray_IsScalar(obj, Floating));
|
||||
PyArray_IsScalar(obj, Floating));
|
||||
}
|
||||
|
||||
}} // namespace torch::utils
|
||||
|