From 69aa6b4bb92b54cf24568236d35009a476ee906f Mon Sep 17 00:00:00 2001
From: Kazuaki Ishizaki <ishizaki@jp.ibm.com>
Date: Mon, 6 Mar 2023 18:05:10 +0000
Subject: [PATCH] fix typo in comments under torch/csrc/autograd (#96061)

This PR fixes typos in comments of `.cpp` and `.h` files under the `torch/csrc/autograd` directory.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/96061
Approved by: https://github.com/soulitzer
---
 torch/csrc/autograd/FunctionsManual.cpp       | 34 +++++++++----------
 torch/csrc/autograd/VariableTypeUtils.h       |  2 +-
 torch/csrc/autograd/autograd_meta.cpp         |  2 +-
 .../autograd_not_implemented_fallback.cpp     |  4 +--
 torch/csrc/autograd/custom_function.cpp       |  2 +-
 torch/csrc/autograd/engine.cpp                |  6 ++--
 torch/csrc/autograd/function.h                |  2 +-
 .../csrc/autograd/functions/accumulate_grad.h |  2 +-
 torch/csrc/autograd/graph_task.h              |  2 +-
 torch/csrc/autograd/input_buffer.cpp          |  2 +-
 torch/csrc/autograd/profiler_kineto.h         |  2 +-
 torch/csrc/autograd/profiler_legacy.cpp       |  4 +--
 torch/csrc/autograd/saved_variable.cpp        |  2 +-
 torch/csrc/autograd/saved_variable.h          |  2 +-
 torch/csrc/autograd/variable.cpp              |  4 +--
 15 files changed, 36 insertions(+), 36 deletions(-)

diff --git a/torch/csrc/autograd/FunctionsManual.cpp b/torch/csrc/autograd/FunctionsManual.cpp
index 5e53972efa69..3a5d1bdc8623 100644
--- a/torch/csrc/autograd/FunctionsManual.cpp
+++ b/torch/csrc/autograd/FunctionsManual.cpp
@@ -1684,7 +1684,7 @@ Tensor cholesky_jvp(const Tensor& dA, const Tensor& L, bool upper) {
   // L^{-1}dA(L^{-H}) = L^{-1}dL + (L^{-1}dL)^H
   //               = sym(L^{-1}dL)
   // where sym(X) = X + X^H
-  // A short computaiton gives that the inverse of sym is given by
+  // A short computation gives that the inverse of sym is given by
   // \pi(X) = X.tril() - 0.5*diag(X)
   // so
   // dL = L\pi(L^{-1}dA(L^{-H}))
@@ -1787,8 +1787,8 @@ Tensor cholesky_inverse_jvp(
 // of Ap^i, A^j, dA^k with i, j, k in {1, H}, where X^H = X.mH(). To prove that,
 // note (A Ap)^H = A Ap and (Ap A)^H = Ap A, which could be shown by taking the
 // product between the SVD decompositions of A and Ap. Consider the
-// conjugate-tranposed [2]: (A Ap A)^H = A^H (A Ap) = A^H. By differentiating it
-// we get: dA^H A Ap + A^H dA Ap + A^H A dAp = dA^H. By multiplying from the
+// conjugate-transposed [2]: (A Ap A)^H = A^H (A Ap) = A^H. By differentiating
+// it we get: dA^H A Ap + A^H dA Ap + A^H A dAp = dA^H. By multiplying from the
 // left by Ap^H and using Ap^H A^H = (A Ap)^H = A Ap: Ap^H dA^H A Ap + A Ap dA
 // Ap + A Ap A dAp = Ap^H dA^H. By multiplying from the left by Ap and by
 // applying [1] and [2] repeatedly until impossible we get: Ap Ap^H dA^H A Ap +
@@ -2368,7 +2368,7 @@ Tensor softplus_double_backward(
 //           this later)
 //   4. Return the as_strided view of the storage tensor using input geometry.
 //
-// In step (2), if the output tensor does't have overlapping memory, we can
+// In step (2), if the output tensor doesn't have overlapping memory, we can
 // safely scatter (`storage.as_strided(output_geometry).copy_(grad)`);
 // otherwise, we must use `index_add` as gradients at different indices may need
 // to be summed to a single location.
@@ -2501,12 +2501,12 @@ Tensor softplus_double_backward(
 //
 //        Note that all values in `S(n)` are the same (they point to the same
 //        memory location anyways, so this step doesn't change anything, but
-//        effectively avoids having the denpendency on the layout of `input`.
+//        effectively avoids having the dependency on the layout of `input`.
 //        I.e., the result holds fixed regardless of the layout of `input`, as
 //        long as `input_stride` is fixed.
 //
-//      NOTE: for forward pass, we can equivalently simply selet any one of
-//            `S(n)` as `storage[n]`. However, cosnidering this as an average
+//      NOTE: for forward pass, we can equivalently simply select any one of
+//            `S(n)` as `storage[n]`. However, considering this as an average
 //            operation makes backward easier (so all values in set
 //            `{ grad_input[i] : i in S(n) }` are the same, and it can use the
 //            same geometry as input).
@@ -2645,7 +2645,7 @@ Tensor softplus_double_backward(
 //                                stride[B[j]]
 //
 //              Then the invariant is obviously satisfied at every dimension
-//              in this block if it is satisfied at dimnesion B[-1]. It only
+//              in this block if it is satisfied at dimension B[-1]. It only
 //              remains to show that it is satisfied at the last dimension in
 //              each block.
 //
@@ -3212,7 +3212,7 @@ Tensor svd_backward(
   // where CP(n-1) is the complex projective space of dimension n-1.
   // In other words, M is just the complex projective space, and pi is (pretty
   // similar to) the usual principal bundle from S^{2n-1} to CP(n-1). The case k
-  // > 1 is the same, but requiring a linear inependence condition between the
+  // > 1 is the same, but requiring a linear independence condition between the
   // vectors from the different S^{2n-1} or CP(n-1).
   //
   // Note that this is a U(1)^k-bundle. In plain words, this means that the
@@ -3672,14 +3672,14 @@ Tensor linalg_qr_backward(
     const Tensor& Q,
     const Tensor& R,
     const c10::string_view mode) {
-  // Nb. We won't be too formal below, as writing this proof formaly is a pain
+  // Nb. We won't be too formal below, as writing this proof formally is a pain
   // We'll link here a formal writing of all this at some point in the future
   //
   // Case m >= n
   // dQ = dAR^{-1} - Qsyminv(sym(Q^H dA R^{-1}))
   // dR = syminv(sym(Q^H dA R^{-1}))R
   //
-  // With the notation from the JVP formla, the only two computations that we
+  // With the notation from the JVP formula, the only two computations that we
   // need are syminv*(R) = 0.5 * (R.triu() + R.triu()^H - Re diag(R)) sym*(X) =
   // 2 * X Using these, after a few simplifications we get that gA = (gQ +
   // syminvadj(triu(gR R^H - Q^H gQ)))R^{-H}
@@ -4712,14 +4712,14 @@ std::tuple<Tensor, Tensor, Tensor> _trilinear_backward(
 }
 
 Tensor log1p_backward(const Tensor& grad, const Tensor& self) {
-  // We must conditionally initalize this using to_dense if sparse, sparse
+  // We must conditionally initialize this using to_dense if sparse, sparse
   // addition is not supported without exact shape match
   Tensor self_p1_conj;
   if (self.layout() == c10::kSparse || self.layout() == c10::kSparseCsr ||
       self.layout() == c10::kSparseCsc || self.layout() == c10::kSparseBsr ||
       self.layout() == c10::kSparseBsc) {
     // The warning only applies to the sparsity of self, dense grad is never
-    // materialized so if self is strided and grad is sparse nothing unepected
+    // materialized so if self is strided and grad is sparse nothing unexpected
     // happens memory wise
     TORCH_WARN(
         "log1p_backward: received self with sparse layout, but backward requires materialization of a dense tensor with this shape");
@@ -4959,7 +4959,7 @@ std::tuple<Tensor, Tensor> householder_product_backward(
   // better performance
   bool modify_K_in_place = !at::GradMode::is_enabled();
 
-  // This method exploites that at k-th iteration vector v_k has only elements
+  // This method exploits that at k-th iteration vector v_k has only elements
   // v_k[k:] which are non-zero.
   auto update_grad = [&m](
                          int64_t k,
@@ -5217,7 +5217,7 @@ std::tuple<Tensor, Tensor, Tensor> ormqr_backward(
   if (self_requires_grad || tau_requires_grad) {
     if (left ^ transpose) {
       // Assume left = true and transpose = false. The case with
-      // left = false and tranpose = true is very much similar with just
+      // left = false and transpose = true is very much similar with just
       // transposed arguments passed into householder_product_backward.
       // Ormqr computes B = H_1 * ... * H_k * A.
       // The sensivity wrt H_i is given by (see notes in
@@ -6068,7 +6068,7 @@ Tensor gather_with_keepdimed_indices(
 // P^T dA1 = dL U1 + L dU1 => [left-multiply by L^{-1}, right-multiply by
 // U1^{-1}] L^{-1} P^T dA1 U1^{-1} = L^{-1} dL + dU1 U1^{-1} (**). Note, L is
 // lower-triangular, and so is its inverse, hence L^{-1} dL is lower-triangular.
-// Also, since the diagonal of L (all ones) is never exposed explicity (packed
+// Also, since the diagonal of L (all ones) is never exposed explicitly (packed
 // representation), the diagonal of dL is zero, and hence diag(L^{-1} dL) = 0.
 // Assuming that U1 is full-rank, similarly, dU1 U1^{-1} is upper-triangular.
 // Combining these observations we conclude:
@@ -6351,7 +6351,7 @@ Tensor logsumexp_jvp(
     const Tensor& self_t,
     IntArrayRef dim,
     bool keepdim) {
-  // NB: for simplicitly, we recompute some values that can be reused from
+  // NB: for simplicity, we recompute some values that can be reused from
   // forward
   auto self_p_exp = [&self_p, &dim]() {
     if (self_p.sym_numel() > 0) {
diff --git a/torch/csrc/autograd/VariableTypeUtils.h b/torch/csrc/autograd/VariableTypeUtils.h
index 34eda5378721..a308aea3e0a1 100644
--- a/torch/csrc/autograd/VariableTypeUtils.h
+++ b/torch/csrc/autograd/VariableTypeUtils.h
@@ -330,7 +330,7 @@ inline std::vector<at::Tensor> as_view(
         "Non-backward differentiable views must have creation_meta=CreationMeta::DEFAULT");
   }
   if (is_fw_differentiable) {
-    // Check if base is a forward differentiabble view
+    // Check if base is a forward differentiable view
     auto diff_view_meta = torch::autograd::impl::get_view_autograd_meta(base);
     if (diff_view_meta && diff_view_meta->has_fw_view()) {
       const auto& base_fw_info = diff_view_meta->get_forward_view();
diff --git a/torch/csrc/autograd/autograd_meta.cpp b/torch/csrc/autograd/autograd_meta.cpp
index 968bc5139141..900a5e69944b 100644
--- a/torch/csrc/autograd/autograd_meta.cpp
+++ b/torch/csrc/autograd/autograd_meta.cpp
@@ -276,7 +276,7 @@ const Variable& AutogradMeta::fw_grad(
     return ForwardGrad::undef_grad();
   }
 
-  // Ensure that concurent fw_grad() "reads" are thread safe
+  // Ensure that concurrent fw_grad() "reads" are thread safe
   std::lock_guard<std::mutex> lock(mutex_);
 
   const auto& direct_fw_grad =
diff --git a/torch/csrc/autograd/autograd_not_implemented_fallback.cpp b/torch/csrc/autograd/autograd_not_implemented_fallback.cpp
index 890a7fa3e6e9..0c0e07c84ccc 100644
--- a/torch/csrc/autograd/autograd_not_implemented_fallback.cpp
+++ b/torch/csrc/autograd/autograd_not_implemented_fallback.cpp
@@ -187,7 +187,7 @@ void autogradNotImplementedFallbackImpl(
               t.use_count() <= 1, op_name); // Okay to return undefined tensor
         // note(crcrpar): `_foreach_norm` returns a list of scalar Tensors and
         // each Tensor shares a storage of a hidden, intermediate 1D Tensor
-        // created inside the CUDA implemenetation. This is because the
+        // created inside the CUDA implementation. This is because the
         // reference implementation of nvidia/apex repo returns this 1D Tensor
         // where each element represents the norm of corresponding input Tensor,
         // here I want to return the same number of Tensors as the input
@@ -357,7 +357,7 @@ void autogradNotImplementedInplaceOrViewFallbackImpl(
               ? CreationMeta::INFERENCE_MODE
               : (at::GradMode::is_enabled() ? CreationMeta::MULTI_OUTPUT_NODE
                                             : CreationMeta::NO_GRAD_MODE));
-      // ^ pass in creation meta unecessarily even if not isDifferentiableType,
+      // ^ pass in creation meta unnecessarily even if not isDifferentiableType,
       // but we don't have that
       //   information here anyway.
       stack->at(stack->size() - num_returns + aliased_output_idx) = result;
diff --git a/torch/csrc/autograd/custom_function.cpp b/torch/csrc/autograd/custom_function.cpp
index 05b3642c1572..527a87a87aa3 100644
--- a/torch/csrc/autograd/custom_function.cpp
+++ b/torch/csrc/autograd/custom_function.cpp
@@ -29,7 +29,7 @@ Variable VariableInfo::zeros(at::OptionalDeviceGuard& device_guard) const {
 }
 
 // This function has two main goals:
-//  1) Use the user-provided jvp function to populate the the outputs' forward
+//  1) Use the user-provided jvp function to populate the outputs' forward
 //  gradient 2) Perform error checking to ensure that view and inplace ops are
 //  properly handled
 //
diff --git a/torch/csrc/autograd/engine.cpp b/torch/csrc/autograd/engine.cpp
index 965c2dc109ae..61078b22d0c4 100644
--- a/torch/csrc/autograd/engine.cpp
+++ b/torch/csrc/autograd/engine.cpp
@@ -411,7 +411,7 @@ std::vector<Node*> get_current_graph_task_execution_order() {
   }
 
   // We could potentially check if there is only a single device here
-  // but explicitly require this context doens't seem bad either
+  // but explicitly require this context doesn't seem bad either
   TORCH_CHECK(
       !c10::AutogradState::get_tls_state().get_multithreading_enabled(),
       "get_current_graph_task_execution_order expects the current backward to be "
@@ -849,7 +849,7 @@ void validate_outputs(
     if (grad.layout() != metadata.layout()) {
       // TODO: Currently we only support (*, Sparse) combination for
       // (tensor.layout(), tensor.grad.layout()) In future, there will be an
-      // oppportunity to support more combinations of layouts if they are
+      // opportunity to support more combinations of layouts if they are
       // composable (example., operations like addition etc., are well defined
       // between tensors of different layouts.), as well as all parts of
       // autograd like AccumulateGrad correctly handle this. We allow grad to be
@@ -1501,7 +1501,7 @@ void GraphTask::init_to_execute(
   // recursion, but the actual code does this iteratively. Refer to the
   // numbering to see how the actual code corresponds. A difference to note is
   // that in the iterative version, when you are working with the current Node,
-  // you are reponsible to update your parent's is_needed after all your
+  // you are responsible to update your parent's is_needed after all your
   // children have been updated.
   //
   // is_needed = {fn: True for fn in outputs}             # (0)
diff --git a/torch/csrc/autograd/function.h b/torch/csrc/autograd/function.h
index 05ba3edecf07..8fbf4104a1fb 100644
--- a/torch/csrc/autograd/function.h
+++ b/torch/csrc/autograd/function.h
@@ -560,7 +560,7 @@ struct TORCH_API Node : std::enable_shared_from_this<Node> {
   variable_list traced_apply(variable_list inputs);
 
   // Sequence number used to correlate backward nodes with forward ops in the
-  // profiler and provide determinisim in the engine.
+  // profiler and provide determinism in the engine.
   // NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes)
   const uint64_t sequence_nr_;
 
diff --git a/torch/csrc/autograd/functions/accumulate_grad.h b/torch/csrc/autograd/functions/accumulate_grad.h
index 15c266faed54..4e3768d33492 100644
--- a/torch/csrc/autograd/functions/accumulate_grad.h
+++ b/torch/csrc/autograd/functions/accumulate_grad.h
@@ -138,7 +138,7 @@ struct TORCH_API AccumulateGrad : public Node {
         // shallow copy. We need a shallow copy so that modifying the original
         // grad tensor doesn't modify the grad we accumulate.
         // We only skip clone if indices and values themselves are contiguous
-        // for backward compatiblity reasons. Since without this optimization,
+        // for backward compatibility reasons. Since without this optimization,
         // earlier we would clone the entire SparseTensor which cloned indices
         // and values.
         // For details see https://github.com/pytorch/pytorch/issues/34375.
diff --git a/torch/csrc/autograd/graph_task.h b/torch/csrc/autograd/graph_task.h
index 4d0a7ea84fe7..6256e44b3987 100644
--- a/torch/csrc/autograd/graph_task.h
+++ b/torch/csrc/autograd/graph_task.h
@@ -143,7 +143,7 @@ struct GraphTask : std::enable_shared_from_this<GraphTask> {
 
   // The value of worker_device in the thread that created this task.
   // See Note [Reentrant backwards]
-  // Safe to read owner_ and reentrant_depth_ without synchronizaton
+  // Safe to read owner_ and reentrant_depth_ without synchronization
   int owner_;
   // The number of parent graph tasks for this graph task
   const int reentrant_depth_;
diff --git a/torch/csrc/autograd/input_buffer.cpp b/torch/csrc/autograd/input_buffer.cpp
index a8d8b9880faa..f1e3b6981719 100644
--- a/torch/csrc/autograd/input_buffer.cpp
+++ b/torch/csrc/autograd/input_buffer.cpp
@@ -148,7 +148,7 @@ void InputBuffer::add(
   //  (4) var is a CUDA variable and it shares a device with the producer but
   //  not the consumer:
   //       (4a) Uses the producer device's default stream as the accumulation
-  //       stream (4b) Syncs the accumulation stream with the the producer's
+  //       stream (4b) Syncs the accumulation stream with the producer's
   //       stream (4c) Accumulates.
   //  (5) var is a CUDA variable and it does not share a device with the
   //  consumer or producer.
diff --git a/torch/csrc/autograd/profiler_kineto.h b/torch/csrc/autograd/profiler_kineto.h
index 37764c480e8a..69277c90d186 100644
--- a/torch/csrc/autograd/profiler_kineto.h
+++ b/torch/csrc/autograd/profiler_kineto.h
@@ -109,7 +109,7 @@ struct TORCH_API ProfilerResult {
  * For example, if part of the model is lowered to a dsp backend, then
  * the execution of that part of the model is delegated to the backend.
  * When backend finishes execution it has an option to provide profiling
- * information (latency only at th emoment) corresponding to different operators
+ * information (latency only at the moment) corresponding to different operators
  * that were executed in the backend.
  * When such events are recorded by backend using this API, the event
  * records will be collected by active kineto profiler. If no kineto profiler
diff --git a/torch/csrc/autograd/profiler_legacy.cpp b/torch/csrc/autograd/profiler_legacy.cpp
index 35b8fac7e876..388695957e45 100644
--- a/torch/csrc/autograd/profiler_legacy.cpp
+++ b/torch/csrc/autograd/profiler_legacy.cpp
@@ -44,13 +44,13 @@ namespace profiler {
 // mapping. A corresponding entry is removed when the guard is destroyed,
 // potentially revealing the previously set value for the same slot.
 //
-// For the async tasks, slots previuosly set in the main thread before
+// For the async tasks, slots previously set in the main thread before
 // launching of an async task are shared and visible in the async task.
 //
 // On the other hand, any adding or overwriting of the mapping by the
 // async task is not visible to the main thread and any modification
 // (including removal of the entries) in the main thread is not visible
-// to the async task if it happends after launching the task.
+// to the async task if it happens after launching the task.
 //
 // We use ThreadLocalDebugInfo (slot PROFILER_STATE) to store profiler config,
 // as well as a list of events that happen during profiling.
diff --git a/torch/csrc/autograd/saved_variable.cpp b/torch/csrc/autograd/saved_variable.cpp
index d438205e8947..52fab04c336f 100644
--- a/torch/csrc/autograd/saved_variable.cpp
+++ b/torch/csrc/autograd/saved_variable.cpp
@@ -196,7 +196,7 @@ Variable SavedVariable::unpack(std::shared_ptr<Node> saved_for) const {
   }
 
   // The version counter is correct.
-  // Additionnally, if we deal with a non-leaf variable, we have its correct
+  // Additionally, if we deal with a non-leaf variable, we have its correct
   // grad_fn.
 
   // If we have the original variable, we simply return it
diff --git a/torch/csrc/autograd/saved_variable.h b/torch/csrc/autograd/saved_variable.h
index 6861f2f2f690..8100e6e2bb4f 100644
--- a/torch/csrc/autograd/saved_variable.h
+++ b/torch/csrc/autograd/saved_variable.h
@@ -56,7 +56,7 @@ class TORCH_API SavedVariable {
   // we fall into the second case and its metadata is also saved separately.
   // In that case, the grad_fn must be passed in to the unpack function when
   // reconstructing the Variable (except when we are doing an inplace operation
-  // on a view, see below). The field saved_orignal_ below reflects the two
+  // on a view, see below). The field saved_original_ below reflects the two
   // cases: its value is true in the first case and false in the second case.
   // The value data_.defined() can be false in three cases:
   // 1. SavedVariable was constructed without a Tensor (the value to save is
diff --git a/torch/csrc/autograd/variable.cpp b/torch/csrc/autograd/variable.cpp
index f6fcb1083d6e..dacbe90d13be 100644
--- a/torch/csrc/autograd/variable.cpp
+++ b/torch/csrc/autograd/variable.cpp
@@ -664,14 +664,14 @@ const std::shared_ptr<torch::autograd::Node>& VariableHooks::grad_fn(
       //   self = inplace_op(self)
       //
       // For CPU/CUDA backends, we employ one AsStridedBackward0 Node to
-      // represent the chain of view backward ops for effienciency.
+      // represent the chain of view backward ops for efficiency.
       //
       // However in XLA backend we don't have full support of
       // AsStridedBackward0, we instead run a full forward pass with a tensor
       // that requires gradient to get proper grad_fn setup, then save it to
       // DifferentiableViewMeta for future use. This is fairly cheap for XLA
       // lazy tensor approach (but would be really expensive for CPU/CUDA). XLA
-      // Tensor only run thorugh VariableType dispatch and lower the forward
+      // Tensor only run through VariableType dispatch and lower the forward
       // pass to a XLA HLO graph, then we take grad_fn and never materialize the
       // tensor content. So we only construct the graph but not execute it,
       // which is a fairly cheap operation to do.