Mirror of https://github.com/pytorch/pytorch.git, synced 2025-10-20 21:14:14 +08:00
Fix typos, via a Levenshtein-type corrector (#31523)
Summary: Should be non-semantic. Uses https://en.wikipedia.org/wiki/Wikipedia:Lists_of_common_misspellings/For_machines to find likely typos, with https://github.com/bwignall/typochecker to help automate the checking. Uses an updated version of the tool used in https://github.com/pytorch/pytorch/pull/30606.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/31523
Differential Revision: D19216749
Pulled By: mrshenli
fbshipit-source-id: 7fd489cb9a77cd7e4950c1046f925d57524960ea
Committed by: Facebook Github Bot
Parent: c8ca70e39d
Commit: f326045b37
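The summary describes the workflow only in prose: look up each word against a machine-readable list of common misspellings and substitute the accepted spelling, with a human reviewing the suggestions. The sketch below illustrates just the lookup-and-substitute step in Python; it is not the typochecker tool itself, it omits the Levenshtein-style ranking of candidate corrections, and its `MISSPELLINGS` table, whole-word regex, and restriction to `*.py` files are illustrative assumptions.

```python
import pathlib
import re

# Illustrative subset of the Wikipedia "common misspellings" list; a real run
# would load the full machine-readable list rather than hard-coding entries.
MISSPELLINGS = {
    "libaries": "libraries",
    "seperate": "separate",
    "optimiation": "optimization",
}

TOKEN = re.compile(r"[A-Za-z]+")

def fix_typos(text: str) -> str:
    """Replace whole-word matches of known misspellings, preserving a leading capital."""
    def repl(match: re.Match) -> str:
        word = match.group(0)
        fix = MISSPELLINGS.get(word.lower())
        if fix is None:
            return word
        return fix.capitalize() if word[0].isupper() else fix
    return TOKEN.sub(repl, text)

if __name__ == "__main__":
    # Walk a source tree and rewrite files in place when a known typo is found.
    for path in pathlib.Path(".").rglob("*.py"):
        original = path.read_text(encoding="utf-8", errors="ignore")
        fixed = fix_typos(original)
        if fixed != original:
            path.write_text(fixed, encoding="utf-8")
            print(f"fixed typos in {path}")
```

Because the hunks below touch only comments, docstrings, and message strings, the result is non-semantic, as the summary notes.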
@@ -270,7 +270,7 @@ if (MSVC)
endif()

# /bigobj increases number of sections in .obj file, which is needed to link
-# against libaries in Python 2.7 under Windows
+# against libraries in Python 2.7 under Windows
set(${flag_var} "${${flag_var}} /MP /bigobj")
endforeach(flag_var)

@@ -10,7 +10,7 @@
/test/test_c10d.py @pietern @mrshenli @zhaojuanmao
/torch/utils/cpp_extension.py @goldsborough @fmassa @soumith @ezyang

-# Not there to stricly require the approval, but to be tagged as a reviewer
+# Not there to strictly require the approval, but to be tagged as a reviewer
# on the PRs to push them into a high priority inbox.
/torch/csrc/api/data/ @apaszke
/torch/csrc/autograd/ @apaszke

@@ -24,7 +24,7 @@ else()
set(CAFFE2_STATIC_LINK_CUDA_INT 0)
endif()
CONFIGURE_FILE(Config.h.in "${CMAKE_CURRENT_SOURCE_DIR}/Config.h")
-# TODO: Don't unconditionally generate CUDAConfig.h.in. Unfortuantely,
+# TODO: Don't unconditionally generate CUDAConfig.h.in. Unfortunately,
# this file generates AT_ROCM_ENABLED() which is required by the miopen
# files, which are compiled even if we are doing a vanilla CUDA build.
# Once we properly split CUDA and HIP in ATen, we can remove this code.

@@ -8,7 +8,7 @@ namespace c10 {
namespace detail {
// WrapRuntimeKernelFunctor: Wraps any runtime functor into a functor that
// inherits from c10::OperatorKernel, so it can be used as a c10 kernel.
-// This can, for example, be used for lamdas, functors or even function pointers.
+// This can, for example, be used for lambdas, functors or even function pointers.
// In the case of function pointers, since it is a runtime function pointer,
// there is an overhead for calling it whenever the kernel is invoked.
template<class FuncType, class ReturnType, class ParameterList> class WrapRuntimeKernelFunctor_ {};

@@ -184,7 +184,7 @@ struct FunctionSchema {
std::vector<Argument> returns_;
// if true then this schema takes an arbitrary number of additional arguments
// after the argument specified in arguments
-// currently this is used primarily to represent 'primtive' operators whose
+// currently this is used primarily to represent 'primitive' operators whose
// arguments are not checked by schema
bool is_vararg_;
bool is_varret_;

@@ -1366,7 +1366,7 @@ struct getTypePtr_<at::optional<T>> final {
} // namespace detail
template <class T>
inline TypePtr getTypePtr() {
-// TODO: static_assert that a templated function exists, and throw a friendy
+// TODO: static_assert that a templated function exists, and throw a friendly
// error message if not
return detail::getTypePtr_<T>::call();
}

@@ -84,7 +84,7 @@ public:
// a constexpr variable if we never odr-use it. But it seems that some
// versions GCC/Clang have buggy determinations on whether or not an
// identifier is odr-used or not, and in any case it's hard to tell if
-// a variable is odr-used or not. So best to just cut the probem at the root.
+// a variable is odr-used or not. So best to just cut the problem at the root.
static constexpr int size() {
return 32 / sizeof(T);
}

@@ -94,7 +94,7 @@ uint64_t CUDAGenerator::current_seed() const {
}

/**
-* Gets a nondeterminstic random number from /dev/urandom or time,
+* Gets a nondeterministic random number from /dev/urandom or time,
* seeds the CPUGenerator with it and then returns that number.
*
* FIXME: You can move this function to Generator.cpp if the algorithm

@@ -53,7 +53,7 @@ namespace at { namespace cuda {
// NOTE [ ATen NVRTC Stub and HIP ]
//
// ATen's NVRTC stub library, caffe2_nvrtc, provides dynamic loading of both
-// NVRTC and driver APIs. While the former is not yet suppoted for HIP, the
+// NVRTC and driver APIs. While the former is not yet supported for HIP, the
// later is supported and needed (e.g., in CUDAHooks::getDeviceWithPrimaryContext()
// used by tensor.pin_memory()).
//

@@ -76,7 +76,7 @@ public:
T* desc() const { return desc_.get(); }
T* desc() { return desc_.get(); }

-// Use mut_desc() to access the underlying desciptor pointer
+// Use mut_desc() to access the underlying descriptor pointer
// if you intend to modify what it points to (e.g., using
// cudnnSetFooDescriptor). This will ensure that the descriptor
// is initialized. Code in this file will use this function.

@@ -27,7 +27,7 @@ namespace c10 { namespace hip {
// HIP occurs; instead, anywhere we see "CUDA", it actually means "HIP".
// For example, when you use HIPified PyTorch, you say x.cuda() to
// move a tensor onto ROCm device. We call this situation "HIP
-// maquerading as CUDA".
+// masquerading as CUDA".
//
// This leads to a very awkward situation when we want to call c10_hip
// code from PyTorch, since c10_hip is expecting things to be called

@@ -61,7 +61,7 @@ public:
T* desc() const { return desc_.get(); }
T* desc() { return desc_.get(); }

-// Use mut_desc() to access the underlying desciptor pointer
+// Use mut_desc() to access the underlying descriptor pointer
// if you intend to modify what it points to (e.g., using
// miopenSetFooDescriptor). This will ensure that the descriptor
// is initialized. Code in this file will use this function.

@@ -1104,7 +1104,7 @@ Tensor _lu_solve_helper_cpu(const Tensor& self, const Tensor& LU_data, const Ten
return self_working_copy;
}

-// Supports arbitrary batch dimensions for self and LU_data (implicity LU_pivots also)
+// Supports arbitrary batch dimensions for self and LU_data (implicitly LU_pivots also)
Tensor lu_solve(const Tensor& self, const Tensor& LU_data, const Tensor& LU_pivots) {
TORCH_CHECK(self.dim() >= 2,
"b should have at least 2 dimensions, but has ", self.dim(), " dimensions instead");

@@ -59,7 +59,7 @@ static inline void multi_margin_loss_cpu_kernel(
using accscalar_t = at::acc_type<scalar_t, false>;

// dim() != 0 check is for 1d input which produces a scalar output (that
-// cannot be handeld by TensorAccessor)
+// cannot be handled by TensorAccessor)
if (reduction == Reduction::None && output.dim() > 0) {
auto output_acc = output.accessor<scalar_t, 1>();
for (int64_t t = 0; t < nframe; t++) {

@@ -295,7 +295,7 @@ static std::vector<QuantizedCellParamsDynamic> gather_quantized_params_dynamic(
}
return result;
#else // USE_FBGEMM
-TORCH_INTERNAL_ASSERT(false, "Tried to use quantized RNN wihtout FBGEMM!")
+TORCH_INTERNAL_ASSERT(false, "Tried to use quantized RNN without FBGEMM!")
#endif // USE_FBGEMM
}

@@ -276,7 +276,7 @@ std::tuple<Tensor, Tensor> kthvalue(
return at::kthvalue(self, k, dimname_to_position(self, dim), keepdim);
}

-// this does not reduce to median with dim beause we don't want to copy twice
+// this does not reduce to median with dim because we don't want to copy twice
Tensor median_cpu(const Tensor& self) {
NoNamesGuard guard;
TORCH_CHECK(self.numel() > 0, "median cannot be called with empty tensor");

@@ -618,7 +618,7 @@ Tensor index_select_sparse(const Tensor& self, int64_t dim, const Tensor& index)
self - sparse tensor, its shape is sizes = sparse_shape + dense_shape
indices - 2-D tensor of indices, shape is (sparse_dims, nnz)
values - (1+len(dense_shape))-D tensor of values, shape is (nnz,) + dense_shape
-index_select(dim, index) returns a sparse tensor with the follwing data
+index_select(dim, index) returns a sparse tensor with the following data
new_sizes = sizes[:dim] + (n,) + sizes[dim+1:]
new_indices - shape is (sparse_dims, new_nnz)
new_values - shape is (new_nnz,) + dense_shape

@@ -85,7 +85,7 @@ static void unfolded3d_copy(
const int64_t input_hw = input_height * input_width;
const int64_t input_dhw = input_hw * input_depth;

-// the following variables are updated ouside the most inner loop
+// the following variables are updated outside the most inner loop
int64_t d = d_out * dT - pT + i;
int64_t h = h_out * dH - pH + j;
int64_t ofs = nip * input_dhw + d * input_hw + h * input_width;

@@ -28,7 +28,7 @@
* are computed from the input and the output size;
*
*
-* When the scales are infered from the input and output sizes,
+* When the scales are inferred from the input and output sizes,
* we view each pixel as an area, idx + 0.5 as its center index.
* Here is an example formula in 1D case.
* if align_corners: center of two corner pixel areas are preserved,

@@ -26,7 +26,7 @@ struct Dist {
// map : This tells how to modify (a - b) to form the component that
// gets summed.
// red : This tells how to sum the result of map up. This is
-// separate because the inf norm actuall uses max instead of
+// separate because the inf norm actually uses max instead of
// sum.
// finish : This tells what to do with the aggregated value to compute
// the norm. Generally this is the result of val ^ (1 / p).

@@ -158,7 +158,7 @@ namespace at { namespace native { namespace {
* `apply_fn` will be called multiple times, and together cover the entire
* output spatial space.
*
-* Now you should be able tp understand everything about the implementaion of
+* Now you should be able tp understand everything about the implementation of
* 2D forward kernel shown at the beginning of this note.
*
**/

@@ -117,7 +117,7 @@ static void copy_kernel_cuda(TensorIterator& iter, bool non_blocking) {
Device dst_device = iter.device(0);
Device src_device = iter.device(1);

-// Enable p2p access between devices. (No-op if it invovles the CPU)
+// Enable p2p access between devices. (No-op if it involves the CPU)
bool p2p_enabled = maybe_enable_p2p_access(dst_device, src_device);

if (copy_requires_temporaries(iter, p2p_enabled)) {

@@ -364,7 +364,7 @@ namespace {

// assuming grad_grid is contiguous
// thus we can
-// 1. use index with gGrid_sW to diectly compute gGrid_ptr_NHW
+// 1. use index with gGrid_sW to directly compute gGrid_ptr_NHW
// 2. directly assign to gGrid_ptr_NHW[0], gGrid_ptr_NHW[1]
scalar_t *gGrid_ptr_NHW = grad_grid.data + index * gGrid_sW;
gGrid_ptr_NHW[0] = gix_mult * gix;

@@ -383,7 +383,7 @@ namespace {

// assuming grad_grid is contiguous
// thus we can
-// 1. use index with gGrid_sW to diectly compute gGrid_ptr_NHW
+// 1. use index with gGrid_sW to directly compute gGrid_ptr_NHW
// 2. directly assign to gGrid_ptr_NHW[0], gGrid_ptr_NHW[1]
scalar_t *gGrid_ptr_NHW = grad_grid.data + index * gGrid_sW;
gGrid_ptr_NHW[0] = static_cast<scalar_t>(0);

@@ -569,7 +569,7 @@ namespace {

// assuming grad_grid is contiguous
// thus we can
-// 1. use index with gGrid_sW to diectly compute gGrid_ptr_NDHW
+// 1. use index with gGrid_sW to directly compute gGrid_ptr_NDHW
// 2. directly assign to gGrid_ptr_NDHW[0], gGrid_ptr_NDHW[1], gGrid_ptr_NDHW[2]
scalar_t *gGrid_ptr_NDHW = grad_grid.data + index * gGrid_sW;
gGrid_ptr_NDHW[0] = gix_mult * gix;

@@ -591,7 +591,7 @@ namespace {

// assuming grad_grid is contiguous
// thus we can
-// 1. use index with gGrid_sW to diectly compute gGrid_ptr_NDHW
+// 1. use index with gGrid_sW to directly compute gGrid_ptr_NDHW
// 2. directly assign to gGrid_ptr_NDHW[0], gGrid_ptr_NDHW[1], gGrid_ptr_NDHW[2]
scalar_t *gGrid_ptr_NDHW = grad_grid.data + index * gGrid_sW;
gGrid_ptr_NDHW[0] = static_cast<scalar_t>(0);

@@ -108,7 +108,7 @@ static Tensor wrapIndexOnce(const Tensor & index, int64_t dim, int64_t dim_size,
}

static std::vector<int64_t> computeLinearStride(const Tensor & tensor) {
-// computes the stride as if tensor were contigous
+// computes the stride as if tensor were contiguous
auto sizes = tensor.sizes();
std::vector<int64_t> stride(tensor.dim());
stride[tensor.dim() - 1] = 1;

@@ -7,7 +7,7 @@
//
// The gpu_kernel_with_scalars generates specializations that support a
// single scalar CPU argument, such as from `cuda_tensor + 5`. The CPU scalar
-// is lifted to a kernel paramter instead of copying to device memory.
+// is lifted to a kernel parameter instead of copying to device memory.
// This should be used in conjunction with TensorIterator::allow_cpu_scalars_,
// which is the default for TensorIterator::binary_op. Otherwise, all inputs
// and the output must be on the GPU.

@@ -51,7 +51,7 @@ __device__ __forceinline__ void warp_reduce(acc_t* sum) {
// A "WARP" contains "C10_WARPS_SIZE" threads, these treads are guaranteed to belong to the same warp.
// This is important because it means only __shfl_ instructions are required for reductions.
// Note that this means WARP_SIZE must be a power of two and <= architecture warp size.
-// CUDA warp size is 32 for all existing GPU architecures, but there is no guarantee this will not change for future arch.
+// CUDA warp size is 32 for all existing GPU architectures, but there is no guarantee this will not change for future arch.
// ROCm warp size is 64 for all currently ROCm-supported GPU architectures, but this may change for future archs.
// is_log_softmax is a flag indicating whether SoftMax or LogSoftMax should be computed.
// The template can be instantiated with any floating point type for the type arguments input_t, output_t and acc_t.

@@ -200,7 +200,7 @@ __global__ void cunn_SpatialSoftMaxForward(
for (uint32_t inner_index = blockIdx.y * blockDim.y + threadIdx.y; inner_index < inner_size; inner_index += blockDim.y * gridDim.y) {
const uint32_t data_offset = outer_offset + inner_index;
////////////////////////////////////////////////////////////
-// These two blocks are really eqivalent, but specializing on
+// These two blocks are really equivalent, but specializing on
// blockDim.x == 1 makes the kernel faster when it's unused.
// I didn't want to thread an extra template parameter, and nvcc
// seems to be smart enough to hoist the if outside of the loops.

@@ -177,7 +177,7 @@ void kthvalue_cuda_template(
AT_CUDA_CHECK(cudaGetLastError());
}

-// this does not reduce to median with dim beause we don't want to copy twice
+// this does not reduce to median with dim because we don't want to copy twice
template <typename scalar_t>
Tensor median_cuda_template(const Tensor& self) {
TORCH_CHECK(self.numel() > 0, "median cannot be called with empty tensor");

@@ -211,7 +211,7 @@ inline int64_t resolve_root_int(
// (row + 2f - 1)row <= 2x
// row^2 + (2f-1)row - 2x <= 0. [3]
//
-// Based on ineuqality [3], we have the following coefficients for formula of
+// Based on inequality [3], we have the following coefficients for formula of
// root:
// a = 1
// b = 2f - 1

@@ -254,7 +254,7 @@ inline void get_coordinate_in_tril_trapezoid(
// (-row + 2f + 1)row <= 2x
// row^2 - (2f+1)row + 2x >= 0. [3]
//
-// Based on ineuqality [3], we have the following coefficients for formula of
+// Based on inequality [3], we have the following coefficients for formula of
// root:
// a = 1
// b = -1 - 2f

@@ -213,7 +213,7 @@ __device__ __forceinline__ static void upsample_increment_value_bounded(
accscalar_t value) {
int access_y = max(min(y, height - 1), 0);
int access_x = max(min(x, width - 1), 0);
-/* TODO: result here is trucated to scalar_t,
+/* TODO: result here is truncated to scalar_t,
check: https://github.com/pytorch/pytorch/pull/19630#discussion_r281426912
*/
gpuAtomicAdd(

@@ -1119,7 +1119,7 @@ std::tuple<Tensor, Tensor> pack_hidden<std::tuple<Tensor, Tensor>>(const Tensor&
struct DropoutState {
// Both buffer and event are lazily instantiated when a dropout state is needed
// for the first time. Note that in this case needed != used, as we don't need
-// a bufer to e.g. run RNNs in test mode.
+// a buffer to e.g. run RNNs in test mode.
at::Tensor buffer;
c10::optional<cuda::CUDAEvent> event;
std::mutex mutex;

@@ -99,7 +99,7 @@ static inline void _fft_fill_with_conjugate_symmetry_slice(Tensor& output,
// 1. if this dim idx becomes 1, will need to add (size - 1) * stride
// 2. otherwise, will need to subtract stride
if (from_slice_indices[d] == 0) {
-// Substract. Carries over to previous dimension
+// Subtract. Carries over to previous dimension
from_slice_data -= output.stride(d);
} else if (from_slice_indices[d] == 1) {
// Dimension index becomes 1

@@ -107,7 +107,7 @@ static inline void _fft_fill_with_conjugate_symmetry_slice(Tensor& output,
from_slice_data += (output.size(d) - 1) * output.stride(d);
break;
} else {
-// Substract. Doesn't carry over to previous dimension
+// Subtract. Doesn't carry over to previous dimension
from_slice_data -= output.stride(d);
break;
}

@@ -43,7 +43,7 @@ using namespace mkldnn;

namespace {
// Helper function for getting an ideep tensor out of an aten Tensor.
-// Note in case the aten Tensor is a dense tensor, the retured ideep
+// Note in case the aten Tensor is a dense tensor, the returned ideep
// tensor is just a view of the storage of the aten dense tensor, so
// caller needs to make sure the aten dense tensor's lifetime is
// longer than the ideep tensor.

@@ -23,7 +23,7 @@ inline int start_index(int out_idx, int out_len, int in_len) {
* in_len: the dimension_size of input matrix
* Basically, in_len / out_len gives the number of
* elements in each average computation.
-* This functin computes the start index on input matrix.
+* This function computes the start index on input matrix.
*/
return (int)std::floor((float)(out_idx * in_len) / out_len);
}

@@ -23,7 +23,7 @@ Tensor quantized_clamp_impl(
qclamp_stub(qx.device().type(), qx, *min, *max, qy);
} else {
TORCH_CHECK(
-false, "Both min and max should be specifed for quantized clamp!");
+false, "Both min and max should be specified for quantized clamp!");
}
return qy;
}

@@ -15,7 +15,7 @@ inline void check_inputs(const Tensor& qa, const Tensor& qb) {
TORCH_CHECK(qa.qscheme() == kPerTensorAffine,
"Only per tensor quantization is supported in Mul.");
TORCH_CHECK(qa.qscheme() == qb.qscheme(),
-"Both inputs to Mul must have the same quantization shceme.");
+"Both inputs to Mul must have the same quantization scheme.");
TORCH_CHECK(qa.numel() == qb.numel(),
"Mul operands must be the same size!");
TORCH_CHECK(qa.scalar_type() == qb.scalar_type(),

@@ -63,7 +63,7 @@ void pytorch_qnnp_requantize_fp32__neon(

#ifdef __aarch64__
/*
-* Leverage "Floating-point Convert to Signed integer, rouding to nearest
+* Leverage "Floating-point Convert to Signed integer, rounding to nearest
* with ties to even" instruction. This is an ARMv8 instruction (always
* available in AArch64), which saturates result on overflow. We don't need
* to specifically consider saturated results, they will be clamped at the

@@ -46,7 +46,7 @@ void pytorch_qnnp_requantize_fp32__psimd(
* - Large int32_t values can't be exactly represented as FP32. We expect
* that conversion instruction would round it to nearest FP32 value with
* ties to even, but Clang documentation for __builtin_convertvector does
-* not guaratee that.
+* not guarantee that.
* - Product of two FP32 values is generally not exactly representation as
* an FP32 value, and will be rounded to nearest FP32 value with ties to
* even.

@@ -91,7 +91,7 @@ void pytorch_qnnp_requantize_precise__scalar_unsigned32(
*
* To avoid full 64-bit shift, we leverage the fact that shift >= 32, and do
* it in two steps:
-* - Shift by 32, which can be implemented by extacting the high 32-bit word
+* - Shift by 32, which can be implemented by extracting the high 32-bit word
* on 32-bit systems.
* - Shift by (shift - 32), which can be implemented as a 32-bit shift of
* high word of addition result.

@@ -11,7 +11,7 @@ struct QnnpackOperatorDeleter {
};

// PackedWeight struct for QNNPACK stores the original Weight and Bias as
-// QNNPACK currently does not support an unpack function. Possible optimiation -
+// QNNPACK currently does not support an unpack function. Possible optimization -
// For PyTorch Mobile, once the model is scripted and serialized we don't need
// to call unpack, so we can save some memory by checking for this case.
// Input scale is set to null in pre-pack step. QNNPACK needs bias quantized with

@@ -61,7 +61,7 @@ Tensor& s_addmm_out_sparse_dense_cuda(Tensor& r_, const Tensor& t, const SparseT
TORCH_CHECK(cuda::check_device({sparse_, r_, t, dense}));

TORCH_CHECK(dense.dim() == 2, "addmm: 2D tensor expected, got ", dense.dim(), "D tensor");
-TORCH_CHECK(sparse_.sparse_dim() == 2, "addmm: expected first two dims to be sparse (indices has size 2 at first dim), but got ", sparse_.sparse_dim(), " spase dims");
+TORCH_CHECK(sparse_.sparse_dim() == 2, "addmm: expected first two dims to be sparse (indices has size 2 at first dim), but got ", sparse_.sparse_dim(), " sparse dims");
// no need to check dense_dim because dense_dim + sparse_dim = dim

// mxk * kxn = mxn

@@ -33,7 +33,7 @@
// we should merge macros.
#ifdef _WIN32
#if !defined(AT_CORE_STATIC_WINDOWS)
-// TODO: unfiy the controlling macros.
+// TODO: unify the controlling macros.
#if defined(CAFFE2_BUILD_MAIN_LIBS) || defined(ATen_cpu_EXPORTS) || defined(caffe2_EXPORTS)
#define TH_CPP_API __declspec(dllexport)
#else // defined(CAFFE2_BUILD_MAIN_LIBS) || defined(ATen_cpu_EXPORTS) || defined(caffe2_EXPORTS)

@@ -1,4 +1,4 @@
#pragma once
#include <TH/THStorageFunctions.h>

-// Compatability header. Use THStorageFunctions.h instead if you need this.
+// Compatibility header. Use THStorageFunctions.h instead if you need this.

@@ -42,7 +42,7 @@ inline THStorage* THTensor_getStoragePtr(const THTensor* tensor) {

// [NOTE: nDimension vs nDimensionLegacyNoScalars vs nDimensionLegacyAll]
// nDimension corresponds to the "true" ATen dimension.
-// nDimensionLegacyNoScalars correpsonds to the ATen dimension, except scalars are viewed as 1-dimensional tensors.
+// nDimensionLegacyNoScalars corresponds to the ATen dimension, except scalars are viewed as 1-dimensional tensors.
// nDimensionLegacyAll corresponds to the ATen dimension, except scalars are viewed as 1-dimensional tensors
// and tensors with a dimension of size zero are collapsed to 0-dimensional tensors.
//

@@ -197,7 +197,7 @@ void THVector_(normal_fill)(scalar_t *data,
}

/*
-* This struct's constructor initalizes the dispatch tables. It simply checks
+* This struct's constructor initializes the dispatch tables. It simply checks
* what SIMD extensions are available, and then walks the dispatch table
* to choose the best function.
* NOTE: As implemented, it will initialize the dispatch pointer to the first supported function.

@@ -6,7 +6,7 @@
#include <cuda_runtime.h>
#endif

-// A utility class to implement integer division by muliplication, given a fixed
+// A utility class to implement integer division by multiplication, given a fixed
// divisor.
//
// WARNING: The fast divider algorithm is only implemented for unsigned int;

@@ -41,7 +41,7 @@ void THNN_(SpatialDepthwiseConvolution_updateOutput)(
weight = THCTensor_(newContiguous)(state, weight);
bias = bias ? THCTensor_(newContiguous)(state, bias) : bias;

-// Following the behvaior of other THCUNN functions, we shape the output
+// Following the behavior of other THCUNN functions, we shape the output
// Tensor ourselves

int batchSize = input->size(0);

@@ -14,7 +14,7 @@ class C2SimpleNet(object):
"""
This module constructs a net with 'op_name' operator. The net consist
a series of such operator.
-It intializes the workspace with input blob equal to the number of parameters
+It initializes the workspace with input blob equal to the number of parameters
needed for the op.
Provides forward method to run the net niter times.
"""

@@ -37,7 +37,7 @@ List all the supported tests:
$ python -m pt.add_test --list_tests
```

-Filter and run a test (use `add_M8_N16_K32` as an exapmle):
+Filter and run a test (use `add_M8_N16_K32` as an example):
```
$ python -m pt.add_test --test_name add_K32_M8_N1
--omp_num_threads 1 --mkl_num_threads 1

@@ -145,7 +145,7 @@ OpMeta = namedtuple("OpMeta", "op_type num_inputs input_dims input_types \

def generate_c2_test_from_ops(ops_metadata, bench_op, tags):
"""
-This function is used to generate Caffe2 tests based on the meatdata
+This function is used to generate Caffe2 tests based on the metadata
of operators. The metadata includes seven fields which are 1) op_type:
the name of the operator. 2) num_inputs: the number of input blobs.
3) input_dims: a dictionary which includes the shapes of the input blobs.

@@ -93,7 +93,7 @@ def _build_test(configs, bench_op, OperatorTestCase, run_backward, op_name_funct
tags = attr["tags"]
continue

-# if 'cuda' is sepcified in input shape but the testing machines doesn't
+# if 'cuda' is specified in input shape but the testing machines doesn't
# support, we will skip this input
if 'cuda' in attr.values():
if not torch.cuda.is_available():

@@ -52,7 +52,7 @@ class LSTMBenchmark(op_bench.TorchBenchmarkBase):

self.x = torch.randn(sequence_len, # sequence length
batch_size, # batch size
-I) # Number of featues in X
+I) # Number of features in X
self.h = torch.randn(NL * (D + 1), # layer_num * dir_num
batch_size, # batch size
H) # hidden size

@@ -62,7 +62,7 @@ C10_DEFINE_string(
"Report the conversion stage time to screen. "
"The format of the string is <type>|<identifier>. "
"The valid type is 'json'. "
-"The valid identifier is nothing or an identifer that prefix every line");
+"The valid identifier is nothing or an identifier that prefix every line");
C10_DEFINE_string(
scale,
"-1,-1",

@@ -63,7 +63,7 @@ C10_DEFINE_string(
"Report the conversion stage time to screen. "
"The format of the string is <type>|<identifier>. "
"The valid type is 'json'. "
-"The valid identifier is nothing or an identifer that prefix every line");
+"The valid identifier is nothing or an identifier that prefix every line");
C10_DEFINE_string(
scale,
"-1,-1",

@@ -203,7 +203,7 @@ struct C10_API StorageImpl final : public c10::intrusive_ptr_target {
data_ptr_ = std::move(data_ptr);
// NOTE: data_type might change and so it's also possible that capacity
// might not be divisible by itemsize. There is no way for us to keep track
-// of the exact capacity if we're not explicity storing is. More conrectely
+// of the exact capacity if we're not explicitly storing is. More concretely
// capacity() might not return the value that was set here, if itemsize does
// not evenly divide it.
numel_ = capacity / data_type_.itemsize();

@@ -5,7 +5,7 @@
#include <c10/util/Registry.h>

// Note: we use a different namespace to test if the macros defined in
-// Registry.h actuall works with a different namespace from c10.
+// Registry.h actually works with a different namespace from c10.
namespace c10_test {

class Foo {

@@ -10,7 +10,7 @@

namespace {
/**
-* This is a helper function which attemtps to get a base value depending on the
+* This is a helper function which attempts to get a base value depending on the
* # of nodes. Larger the base the better performance (up to 4) is what we have
* observed in gloo benchmarks. At the moment bcube works only if # nodes = base
* ^ x. Where x is some constant. So, if # node don't match our expectation

@@ -63,7 +63,7 @@ class AllreduceOp final : public Operator<Context> {
// Store which inputs/outputs this instance initialized with
update(init_);

-// Verify inputs == ouputs
+// Verify inputs == outputs
CAFFE_ENFORCE_EQ(init_.inputs.size(), init_.outputs.size());
for (auto i = 0; i < init_.inputs.size(); i++) {
CAFFE_ENFORCE_EQ(init_.inputs[i], init_.outputs[i]);

@@ -37,7 +37,7 @@ std::unique_ptr<::gloo::Algorithm> initializeAlgorithm(
}

/**
-* This is a helper function which attemtps to get a base value depending on the
+* This is a helper function which attempts to get a base value depending on the
* # of nodes. Larger the base the better performance (up to 4) is what we have
* observed in gloo benchmarks. At the moment bcube works only if # nodes = base
* ^ x. Where x is some constant. So, if # node don't match our expectation

@@ -58,7 +58,7 @@ class BroadcastOp final : public Operator<Context> {
// Store which inputs/outputs this instance initialized with
update(init_);

-// Verify inputs == ouputs
+// Verify inputs == outputs
CAFFE_ENFORCE_EQ(init_.inputs.size(), init_.outputs.size());
for (auto i = 0; i < init_.inputs.size(); i++) {
CAFFE_ENFORCE_EQ(init_.inputs[i], init_.outputs[i]);

@@ -73,7 +73,7 @@ class ReduceScatterOp final : public Operator<Context> {
// Store which inputs/outputs this instance initialized with
update(init_);

-// Verify inputs == ouputs
+// Verify inputs == outputs
CAFFE_ENFORCE_EQ(init_.inputs.size(), init_.outputs.size());
for (auto i = 0; i < init_.inputs.size(); i++) {
CAFFE_ENFORCE_EQ(init_.inputs[i], init_.outputs[i]);

@@ -68,7 +68,7 @@
* The following example shows a general use case for the C++
* bindings, including support for the optional exception feature and
* also the supplied vector and string classes, see following sections for
-* decriptions of these features.
+* descriptions of these features.
*
* \code
* #define __CL_ENABLE_EXCEPTIONS

@@ -108,7 +108,7 @@ def broadcast_parameters(opts, model, num_xpus, broadcast_computed_param=False):
else caffe2_pb2.CPU
for params in all_params:
assert len(params) % num_xpus == 0, \
-"Current model dosen't match device number when loading checkpoint"
+"Current model doesn't match device number when loading checkpoint"
params_per_xpu = int(len(params) / num_xpus)
for idx in range(params_per_xpu):
blobs = [param for param in params[idx::params_per_xpu]]

@@ -507,7 +507,7 @@ void TensorRTTransformer::Transform(
return true;
};

-// function to convert runnbale subgraph into a trt op. Note that to keep the
+// function to convert runnable subgraph into a trt op. Note that to keep the
// interface clean, we do the double conversion from C2 op to Onnx ops here
// but it should be OK as the cost is really small. We also need to keep the
// same exporter throughout the process to avoid duplicated dummy name

@@ -38,7 +38,7 @@ CAFFE2_API void SerializeBlob(
/**
* @brief Convenience function to serialize a blob to a string.
*
-* This is a conveinence function to serialize small Blobs that produce
+* This is a convenience function to serialize small Blobs that produce
* manageable serialized strings. To serialize big blobs such as
* large sparse tensors, use the fully-functional interface in
* blob_serializer_base.h.

@@ -92,7 +92,7 @@ inline Dst dynamic_cast_if_rtti(Src ptr) {
}

// SkipIndices are used in operator_fallback_gpu.h and operator_fallback_mkl.h
-// as utilty functions that marks input / output indices to skip when we use a
+// as utility functions that marks input / output indices to skip when we use a
// CPU operator as the fallback of GPU/MKL operator option.
template <int... values>
class SkipIndices {

@@ -174,7 +174,7 @@ std::unique_ptr<cub::CachingDeviceAllocator> g_cub_allocator;
static std::unordered_map<void*, uint8_t> g_cuda_device_affiliation;

// Data structures for optional memory tracking. Access to these structures
-// is garded by the CUDAContext::mutex.
+// is guarded by the CUDAContext::mutex.
static std::unordered_map<void*, long> g_size_map;
static std::vector<long> g_total_by_gpu_map(C10_COMPILE_TIME_MAX_GPUS, 0);
static std::vector<long> g_max_by_gpu_map(C10_COMPILE_TIME_MAX_GPUS, 0);

@@ -471,7 +471,7 @@ class ComputeBlobRecyclingForDag {
}
}

-// Rturns true if the op that generates that blob acquires all tokens.
+// Returns true if the op that generates that blob acquires all tokens.
inline bool can_use_blob(
const string& blob_name,
std::unordered_set<int>* tokens,

@@ -76,7 +76,7 @@ class CAFFE2_API NetBase : public Observable<NetBase> {
* seconds spent during the benchmark. The 0-th item is the time spent per
* each network run, and if a net instantiation supports run_individual,
* the remainder of the vector returns the number of milliseconds spent per
-* opeartor.
+* operator.
*/
virtual vector<float> TEST_Benchmark(
const int /*warmup_runs*/,

@@ -461,7 +461,7 @@ std::shared_ptr<Tracer> create(
const std::string& net_name) {
// Enable the tracer if the net has the "enable_tracing" argument set OR
// if the command line option includes the net name option in the list of
-// tracable nets.
+// traceable nets.
bool trace_net = hasEnableTracingFlag(net) || isTraceableNetName(net_name);
return trace_net
? std::make_shared<Tracer>(net, net_name, getTracingConfigFromNet(net))

@@ -24,7 +24,7 @@ SimpleRefCountNet::SimpleRefCountNet(

std::map<string, int> last_consumed_at;
std::set<string> created_by_me;
-// For each opeartor
+// For each operator
for (int idx = 0; idx < net_def->op_size(); ++idx) {
const auto& op_def = net_def->op(idx);
for (const string& in_name : op_def.input()) {

@@ -254,7 +254,7 @@ struct CAFFE2_API NNModule {
NNModule(NNModule&&) = default;
NNModule() {}

-/* Repalce subgraph sg by node, using the order of
+/* Replace subgraph sg by node, using the order of
* node_inputs and node_outputs to determine how to link
* them to the node. node_inputs *must* enumerate all the
* inputs to the subgraph (NeuralNetData that do not

@@ -645,7 +645,7 @@ class CAFFE2_API OperatorBase : public Observable<OperatorBase> {
std::string type_;
vector<const Blob*> inputs_;
vector<Blob*> outputs_;
-// Preferrably use c10::optional, but nvcc doesn't work
+// Preferably use c10::optional, but nvcc doesn't work
#if !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE)
std::unique_ptr<const c10::FunctionSchema> fn_schema_;
vector<c10::IValue> newstyle_inputs_;

@@ -131,7 +131,7 @@ class CAFFE2_API OpSchema {
OpSchema& AllowInplace(std::function<bool(int, int)> inplace);
OpSchema& AllowInplace(set<std::pair<int, int>> inplace);
OpSchema& AllowOneToOneInplace();
-// Sets the rule to enforce in-place opeartion.
+// Sets the rule to enforce in-place operation.
OpSchema& EnforceInplace(std::function<bool(int, int)> inplace);
OpSchema& EnforceInplace(set<std::pair<int, int>> inplace);
OpSchema& EnforceOneToOneInplace();

@@ -112,7 +112,7 @@ using ScopeGuardImplDecay = ScopeGuardImpl<typename std::decay<F>::type>;
/**
* ScopeGuard is a general implementation of the "Initialization is
* Resource Acquisition" idiom. Basically, it guarantees that a function
-* is executed upon leaving the currrent scope unless otherwise told.
+* is executed upon leaving the current scope unless otherwise told.
*
* The MakeGuard() function is used to create a new ScopeGuard object.
* It can be instantiated with a lambda function, a std::function<void()>,

@@ -32,7 +32,7 @@
#define CAFFE_SDT_ARGSIZE(x) (CAFFE_SDT_ISARRAY(x) ? sizeof(void*) : sizeof(x))

// Format of each probe arguments as operand.
-// Size of the arugment tagged with CAFFE_SDT_Sn, with "n" constraint.
+// Size of the argument tagged with CAFFE_SDT_Sn, with "n" constraint.
// Value of the argument tagged with CAFFE_SDT_An, with configured constraint.
#define CAFFE_SDT_ARG(n, x) \
[CAFFE_SDT_S##n] "n" ((size_t)CAFFE_SDT_ARGSIZE(x)), \

@@ -278,7 +278,7 @@ class CAFFE2_API Workspace {
ShouldContinue should_continue = StopOnSignal{});

/*
-* Returns a CPU threadpool instace for parallel execution of
+* Returns a CPU threadpool instance for parallel execution of
* work. The threadpool is created lazily; if no operators use it,
* then no threadpool will be created.
*/

@@ -31,7 +31,7 @@ import caffe2.python.models.resnet as resnet

'''
Simple benchmark that creates a data-parallel resnet-50 model
-and measurs the time.
+and measures the time.
'''

@@ -1023,7 +1023,7 @@ void TransformImage(
ColorNormalization<Context>(image_data, crop, channels, mean, std);
}

-// Only crop / transose the image
+// Only crop / transpose the image
// leave in uint8_t dataType
template <class Context>
void CropTransposeImage(

@@ -68,7 +68,7 @@
* The following example shows a general use case for the C++
* bindings, including support for the optional exception feature and
* also the supplied vector and string classes, see following sections for
-* decriptions of these features.
+* descriptions of these features.
*
* \code
* #define __CL_ENABLE_EXCEPTIONS

@@ -56,7 +56,7 @@ bool NNApi::run(const TensorVector& inputs, TensorVector* outputs) {
try {
init(inputs, outputs);
} catch (const std::exception& e) {
-LOG(ERROR) << "Error duing model initialization: " << e.what();
+LOG(ERROR) << "Error during model initialization: " << e.what();
return false;
}

@@ -1657,7 +1657,7 @@ Caffe2BackendRep* Caffe2Backend::Prepare(
}
}

-// TODO: avoid extra copy by directly feed initialiers to backend blobs
+// TODO: avoid extra copy by directly feed initializers to backend blobs
OnnxToCaffe2(
&rep->init_net(),
&rep->pred_net(),

@@ -185,7 +185,7 @@ void ssaRewriteForIfOp(
OperatorDef* op,
std::unordered_map<std::string, int>* blob_versions,
std::set<std::string>* is_initialized_tensor) {
-// Get all the "external" inputs and outpus of the subnet
+// Get all the "external" inputs and outputs of the subnet
// Since then_net and else_net has same external input/output, we only collect
// external input/output from one of its subnet And perform the rewrite to
// both then_net and else_net

@@ -111,7 +111,7 @@ class CAFFE2_API OnnxExporter {
const caffe2::OperatorDef& def,
const std::unordered_map<std::string, caffe2::TensorShape>& shapes);

-// \brief Check black listed arguemnts where we won't pass down when
+// \brief Check black listed arguments where we won't pass down when
// converting to ONNX node
bool IsBlackListed(const caffe2::Argument& arg);

@@ -138,7 +138,7 @@ ONNX_PYTORCH_OPERATOR_SET_SCHEMA(
OpSchema()
.SetDoc("Mirror Caffe2 BatchMatMul operator")
.Input(0, "X", "tensor of shape (dim0, dim1 ... M, K)", "T")
-.Input(1, "Y", "tensor of shpae (dim0, dim2 ... K, N)", "T")
+.Input(1, "Y", "tensor of shape (dim0, dim2 ... K, N)", "T")
.Output(0, "Z", "tensor of shape (dim0, dim1 ... M, N)", "T")
.TypeConstraint(
"T",

@@ -31,7 +31,7 @@ class CuDNNActivationOpBase : public Operator<CUDAContext> {
const cudnnDataType_t data_type,
const int data_size) {
if (data_size != input_size_) {
-// Since the best performance is obtained when the tesor is HW-packed, we
+// Since the best performance is obtained when the tensor is HW-packed, we
// put X.size() to W.
input_size_ = data_size;
CUDNN_ENFORCE(cudnnSetTensor4dDescriptor(

@@ -69,7 +69,7 @@ The lengths is a 1D tensor that splits the following 'boundaries' argument.
The boundaries is a 1D tensor containing the border list for each feature.

With in each batch, `indices` should not have duplicate number,
-and the number of elements in `indices` should be less than or euqal to `D`.
+and the number of elements in `indices` should be less than or equal to `D`.
Each element in `lengths` vector (lengths[`i`]) represents
the number of boundaries in the sub border list.
The sum of all elements in `lengths` must be equal to the size of `boundaries`.

@@ -126,7 +126,7 @@ OPERATOR_SCHEMA(BatchMatMul)
Batch Matrix multiplication Yi = Ai * Bi, where A has shape (dim0, dim1, ... M, K),
B has shape (dim0, dim1, ... K, N), Y has shape (dim0, dim1, ... M, N) and i ranges
from 0 to (dim0 * dim1 ...) - 1. rank(A) == rank(B) >= 2. In case of A and B being
-two diemnsional, it behaves like normal matrix multiplication.
+two dimensional, it behaves like normal matrix multiplication.
)DOC")
.Input(0, "A", "tensor of shape (dim0, dim1 ... M, K)")
.Input(1, "B", "tensor of shape (dim0, dim1 ... K, N)")

@@ -46,7 +46,7 @@ OPERATOR_SCHEMA(BisectPercentile)
R_2 = [0.3, 1.2];
We will build R = [0.1, 0.4, 0.5, 0.3, 1.2]; besides, we have
lengths = [3, 2]
-to indicate the boundries of the percentile information.
+to indicate the boundaries of the percentile information.

)DOC")
.Arg(

@@ -51,7 +51,7 @@ class BoxWithNMSLimitOp final : public Operator<Context> {
"Unexpected soft_nms_method");
soft_nms_method_ = (soft_nms_method_str_ == "linear") ? 1 : 2;

-// When input `boxes` doesn't inlcude background class, the score will skip
+// When input `boxes` doesn't include background class, the score will skip
// background class and start with foreground classes directly, and put the
// background class in the end, i.e. score[:, 0:NUM_CLASSES-1] represents
// foreground classes and score[:,NUM_CLASSES] represents background class.

@@ -97,7 +97,7 @@ class CudnnConvOpBase : public ConvPoolOpBase<CUDAContext> {
}

protected:
-// A helper function to set up the tensor Nd desriptor, depending on the order
+// A helper function to set up the tensor Nd descriptor, depending on the order
// the group and the type given.
template <typename T>
void SetTensorNdDescriptorWithGroup(

@@ -209,7 +209,7 @@ OPERATOR_SCHEMA(SwapBestPath)
.NumInputs(2)
.NumOutputs(1)
.SetDoc(R"DOC(
-Given a sequence of idices and a matrix, enforce that these indices have the
+Given a sequence of indices and a matrix, enforce that these indices have the
best columnwise scores
score
)DOC")

@@ -170,7 +170,7 @@ In Advances in Neural Information Processing Systems, pp. 1508-1518. 2017.
)DOC")
.Input(0, "input", "Float32 input data")
.Output(0, "output", "Fused bitwidth, tail, min, max and quantized data")
-.Arg("bitwidth", "How many bits to quantiz per data (defaults to 8).")
+.Arg("bitwidth", "How many bits to quantize per data (defaults to 8).")
.Arg("random", "random or not (True). False is set up for unittest.");
NO_GRADIENT(FloatToFusedRandRowwiseQuantized);

@@ -184,7 +184,7 @@ class GatherOp : public Operator<Context> {
// an error.
// Right now, we apply index wrapping by default only to axis == 0,
// since we have ONNX conversion code that uses it. For other ops it
-// needs to be speified explicitly with argument or you don't get it.
+// needs to be specified explicitly with argument or you don't get it.
if (OperatorBase::HasArgument("wrap_indices")) {
wrap_indices_ = Operator<Context>::template GetSingleArgument<bool>(
"wrap_indices", (false));

@@ -69,7 +69,7 @@ CAFFE2_API ERArrXXf ComputeSortedAnchors(
} // namespace utils

// C++ implementation of GenerateProposalsOp
-// Generate bounding box proposals for Faster RCNN. The propoasls are generated
+// Generate bounding box proposals for Faster RCNN. The proposals are generated
// for a list of images based on image score 'score', bounding box
// regression result 'deltas' as well as predefined bounding box shapes
// 'anchors'. Greedy non-maximum suppression is applied to generate the

@@ -632,7 +632,7 @@ search tree.
.Arg("topN", "Number of nodes in outputs")
.Input(0, "X", "Input data from previous layer")
.Input(1, "W", "The matrix trained from Softmax Ops")
-.Input(2, "b", "The bias traiend from Softmax Ops")
+.Input(2, "b", "The bias trained from Softmax Ops")
.Output(
0,
"Y_names",

@@ -140,7 +140,7 @@ bool HeatmapMaxKeypointOp<float, CPUContext>::RunOnDevice() {
}
assert(std::abs(delta(0)) <= MAX_DELTA);
assert(std::abs(delta(1)) <= MAX_DELTA);
-// find maximum of detla scores
+// find maximum of delta scores
keypoints(k, 0 * keypoint_count + j) =
x0 + (0.5 + maxX + delta(0)) * xLen / heatmap_size;
keypoints(k, 1 * keypoint_count + j) =

@@ -74,7 +74,7 @@ class SparseLengths8BitsRowwiseOp : public Operator<Context> {
in_block_size,
outputSize,
indices_size,
-N, // embeding table length
+N, // embedding table length
input_data,
indices,
lengths,

@@ -27,7 +27,7 @@ void ProcessBlob(
auto& blob_states = *blob_states_ptr;
if (blob_states.count(key) == 0) {
// We reset the blob so that any existing content is destroyed. This
-// is to guaranee correct device placement: if we are deserializing
+// is to guarantee correct device placement: if we are deserializing
// into a TensorCUDA, without explicit Reset we might be loading data
// into an existing TensorCUDA that has pre-allocated memory on a
// different GPU.

@@ -46,7 +46,7 @@ inline void LogCuDNNPerfStats(

// Easier indexing into force_algo_ vector,
// shared by CudnnConvTransposeOpBase and CudnnConvOpBase to force
-// usage of a particular algortihm instead of searching
+// usage of a particular algorithm instead of searching
enum { ALGO_FWD = 0, ALGO_WGRAD = 1, ALGO_DGRAD = 2 };

} // namespace caffe2
Some files were not shown because too many files have changed in this diff.