fix typo in comments under torch/csrc/autograd (#96061)

This PR fixes typos in comments of `.cpp` and `.h` files under the `torch/csrc/autograd` directory.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/96061
Approved by: https://github.com/soulitzer
Kazuaki Ishizaki
2023-03-06 18:05:10 +00:00
committed by PyTorch MergeBot
parent 301a28bf8c
commit 69aa6b4bb9
15 changed files with 36 additions and 36 deletions


@@ -1684,7 +1684,7 @@ Tensor cholesky_jvp(const Tensor& dA, const Tensor& L, bool upper) {
// L^{-1}dA(L^{-H}) = L^{-1}dL + (L^{-1}dL)^H
// = sym(L^{-1}dL)
// where sym(X) = X + X^H
-// A short computaiton gives that the inverse of sym is given by
+// A short computation gives that the inverse of sym is given by
// \pi(X) = X.tril() - 0.5*diag(X)
// so
// dL = L\pi(L^{-1}dA(L^{-H}))
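
Written out, the identity this comment sketches reads as follows (with A = L L^H and sym(X) = X + X^H; as in Cholesky, L is assumed to have a real positive diagonal):

\[
L^{-1}\,dA\,L^{-H} \;=\; L^{-1}dL + (L^{-1}dL)^H \;=\; \operatorname{sym}(L^{-1}dL),
\]
\[
\pi(X) \;=\; \operatorname{tril}(X) - \tfrac{1}{2}\operatorname{diag}(X)
\qquad\text{inverts sym on lower-triangular arguments, so}\qquad
dL \;=\; L\,\pi\!\left(L^{-1}\,dA\,L^{-H}\right).
\]
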
@@ -1787,8 +1787,8 @@ Tensor cholesky_inverse_jvp(
// of Ap^i, A^j, dA^k with i, j, k in {1, H}, where X^H = X.mH(). To prove that,
// note (A Ap)^H = A Ap and (Ap A)^H = Ap A, which could be shown by taking the
// product between the SVD decompositions of A and Ap. Consider the
-// conjugate-tranposed [2]: (A Ap A)^H = A^H (A Ap) = A^H. By differentiating it
-// we get: dA^H A Ap + A^H dA Ap + A^H A dAp = dA^H. By multiplying from the
+// conjugate-transposed [2]: (A Ap A)^H = A^H (A Ap) = A^H. By differentiating
+// it we get: dA^H A Ap + A^H dA Ap + A^H A dAp = dA^H. By multiplying from the
// left by Ap^H and using Ap^H A^H = (A Ap)^H = A Ap: Ap^H dA^H A Ap + A Ap dA
// Ap + A Ap A dAp = Ap^H dA^H. By multiplying from the left by Ap and by
// applying [1] and [2] repeatedly until impossible we get: Ap Ap^H dA^H A Ap +
@@ -2368,7 +2368,7 @@ Tensor softplus_double_backward(
// this later)
// 4. Return the as_strided view of the storage tensor using input geometry.
//
-// In step (2), if the output tensor does't have overlapping memory, we can
+// In step (2), if the output tensor doesn't have overlapping memory, we can
// safely scatter (`storage.as_strided(output_geometry).copy_(grad)`);
// otherwise, we must use `index_add` as gradients at different indices may need
// to be summed to a single location.
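
A minimal ATen-style sketch of step (2) above, not the actual implementation: the storage length, the output geometry, the overlap flag, and the precomputed linear indices `flat_idx` are all assumed to be supplied by the caller.

#include <ATen/ATen.h>

// Hypothetical helper illustrating step (2): write `grad` into a flat
// `storage` tensor through the output geometry. `flat_idx` holds one int64
// storage index per element of `grad` (in the same order as grad.reshape(-1)).
at::Tensor scatter_grad_into_storage(
    const at::Tensor& grad,
    int64_t storage_numel,
    at::IntArrayRef out_sizes,
    at::IntArrayRef out_strides,
    int64_t out_offset,
    bool output_has_overlap,
    const at::Tensor& flat_idx) {
  auto storage = at::zeros({storage_numel}, grad.options());
  if (!output_has_overlap) {
    // No memory overlap: every grad element owns its own storage slot, so a
    // strided view plus copy_ is enough.
    storage.as_strided(out_sizes, out_strides, out_offset).copy_(grad);
  } else {
    // Overlapping memory: several grad elements may map to the same slot, so
    // their contributions must be summed.
    storage.index_add_(0, flat_idx, grad.reshape({-1}));
  }
  return storage;
}
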
@@ -2501,12 +2501,12 @@ Tensor softplus_double_backward(
//
// Note that all values in `S(n)` are the same (they point to the same
// memory location anyways, so this step doesn't change anything, but
-// effectively avoids having the denpendency on the layout of `input`.
+// effectively avoids having the dependency on the layout of `input`.
// I.e., the result holds fixed regardless of the layout of `input`, as
// long as `input_stride` is fixed.
//
-// NOTE: for forward pass, we can equivalently simply selet any one of
-// `S(n)` as `storage[n]`. However, cosnidering this as an average
+// NOTE: for forward pass, we can equivalently simply select any one of
+// `S(n)` as `storage[n]`. However, considering this as an average
// operation makes backward easier (so all values in set
// `{ grad_input[i] : i in S(n) }` are the same, and it can use the
// same geometry as input).
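
Spelled out, the average interpretation above reads (notation as in the comment; the 1/|S(n)| factor is what treating storage[n] as an average implies):

\[
\text{storage}[n] \;=\; \frac{1}{|S(n)|}\sum_{i \in S(n)} \text{input}[i],
\qquad\text{so}\qquad
\text{grad\_input}[i] \;=\; \frac{\text{grad\_storage}[n]}{|S(n)|}
\quad\text{for every } i \in S(n),
\]

which is the same value for all i in S(n); and since all input[i] with i in S(n) agree, the forward result coincides with simply picking any single element of S(n).
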
@@ -2645,7 +2645,7 @@ Tensor softplus_double_backward(
// stride[B[j]]
//
// Then the invariant is obviously satisfied at every dimension
-// in this block if it is satisfied at dimnesion B[-1]. It only
+// in this block if it is satisfied at dimension B[-1]. It only
// remains to show that it is satisfied at the last dimension in
// each block.
//
@@ -3212,7 +3212,7 @@ Tensor svd_backward(
// where CP(n-1) is the complex projective space of dimension n-1.
// In other words, M is just the complex projective space, and pi is (pretty
// similar to) the usual principal bundle from S^{2n-1} to CP(n-1). The case k
-// > 1 is the same, but requiring a linear inependence condition between the
+// > 1 is the same, but requiring a linear independence condition between the
// vectors from the different S^{2n-1} or CP(n-1).
//
// Note that this is a U(1)^k-bundle. In plain words, this means that the
@@ -3672,14 +3672,14 @@ Tensor linalg_qr_backward(
const Tensor& Q,
const Tensor& R,
const c10::string_view mode) {
-// Nb. We won't be too formal below, as writing this proof formaly is a pain
+// Nb. We won't be too formal below, as writing this proof formally is a pain
// We'll link here a formal writing of all this at some point in the future
//
// Case m >= n
// dQ = dAR^{-1} - Qsyminv(sym(Q^H dA R^{-1}))
// dR = syminv(sym(Q^H dA R^{-1}))R
//
-// With the notation from the JVP formla, the only two computations that we
+// With the notation from the JVP formula, the only two computations that we
// need are syminv*(R) = 0.5 * (R.triu() + R.triu()^H - Re diag(R)) sym*(X) =
// 2 * X Using these, after a few simplifications we get that gA = (gQ +
// syminvadj(triu(gR R^H - Q^H gQ)))R^{-H}
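
In LaTeX, the case m >= n formulas quoted in this comment are

\[
dQ \;=\; dA\,R^{-1} - Q\,\operatorname{syminv}\!\bigl(\operatorname{sym}(Q^H\,dA\,R^{-1})\bigr),
\qquad
dR \;=\; \operatorname{syminv}\!\bigl(\operatorname{sym}(Q^H\,dA\,R^{-1})\bigr)\,R,
\]
with the adjoints
\[
\operatorname{syminv}^*(R) \;=\; \tfrac{1}{2}\bigl(\operatorname{triu}(R) + \operatorname{triu}(R)^H - \operatorname{Re}\operatorname{diag}(R)\bigr),
\qquad
\operatorname{sym}^*(X) \;=\; 2X,
\]
which give the backward formula
\[
gA \;=\; \bigl(gQ + \operatorname{syminv}^*\!\bigl(\operatorname{triu}(gR\,R^H - Q^H gQ)\bigr)\bigr)\,R^{-H}.
\]
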
@@ -4712,14 +4712,14 @@ std::tuple<Tensor, Tensor, Tensor> _trilinear_backward(
}
Tensor log1p_backward(const Tensor& grad, const Tensor& self) {
-// We must conditionally initalize this using to_dense if sparse, sparse
+// We must conditionally initialize this using to_dense if sparse, sparse
// addition is not supported without exact shape match
Tensor self_p1_conj;
if (self.layout() == c10::kSparse || self.layout() == c10::kSparseCsr ||
self.layout() == c10::kSparseCsc || self.layout() == c10::kSparseBsr ||
self.layout() == c10::kSparseBsc) {
// The warning only applies to the sparsity of self, dense grad is never
-// materialized so if self is strided and grad is sparse nothing unepected
+// materialized so if self is strided and grad is sparse nothing unexpected
// happens memory wise
TORCH_WARN(
"log1p_backward: received self with sparse layout, but backward requires materialization of a dense tensor with this shape");
@@ -4959,7 +4959,7 @@ std::tuple<Tensor, Tensor> householder_product_backward(
// better performance
bool modify_K_in_place = !at::GradMode::is_enabled();
-// This method exploites that at k-th iteration vector v_k has only elements
+// This method exploits that at k-th iteration vector v_k has only elements
// v_k[k:] which are non-zero.
auto update_grad = [&m](
int64_t k,
@@ -5217,7 +5217,7 @@ std::tuple<Tensor, Tensor, Tensor> ormqr_backward(
if (self_requires_grad || tau_requires_grad) {
if (left ^ transpose) {
// Assume left = true and transpose = false. The case with
-// left = false and tranpose = true is very much similar with just
+// left = false and transpose = true is very much similar with just
// transposed arguments passed into householder_product_backward.
// Ormqr computes B = H_1 * ... * H_k * A.
// The sensivity wrt H_i is given by (see notes in
@@ -6068,7 +6068,7 @@ Tensor gather_with_keepdimed_indices(
// P^T dA1 = dL U1 + L dU1 => [left-multiply by L^{-1}, right-multiply by
// U1^{-1}] L^{-1} P^T dA1 U1^{-1} = L^{-1} dL + dU1 U1^{-1} (**). Note, L is
// lower-triangular, and so is its inverse, hence L^{-1} dL is lower-triangular.
-// Also, since the diagonal of L (all ones) is never exposed explicity (packed
+// Also, since the diagonal of L (all ones) is never exposed explicitly (packed
// representation), the diagonal of dL is zero, and hence diag(L^{-1} dL) = 0.
// Assuming that U1 is full-rank, similarly, dU1 U1^{-1} is upper-triangular.
// Combining these observations we conclude:
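
Given the triangular structure just established, the conclusion follows by matching the strictly lower and upper parts of (**); a sketch, writing X := L^{-1} P^T dA1 U1^{-1} and tril_{-1} for the strictly lower-triangular part:

\[
L^{-1}dL \;=\; \operatorname{tril}_{-1}(X),
\qquad
dU_1\,U_1^{-1} \;=\; \operatorname{triu}(X),
\]
so
\[
dL \;=\; L\,\operatorname{tril}_{-1}(X),
\qquad
dU_1 \;=\; \operatorname{triu}(X)\,U_1.
\]
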
@@ -6351,7 +6351,7 @@ Tensor logsumexp_jvp(
const Tensor& self_t,
IntArrayRef dim,
bool keepdim) {
-// NB: for simplicitly, we recompute some values that can be reused from
+// NB: for simplicity, we recompute some values that can be reused from
// forward
auto self_p_exp = [&self_p, &dim]() {
if (self_p.sym_numel() > 0) {


@@ -330,7 +330,7 @@ inline std::vector<at::Tensor> as_view(
"Non-backward differentiable views must have creation_meta=CreationMeta::DEFAULT");
}
if (is_fw_differentiable) {
-// Check if base is a forward differentiabble view
+// Check if base is a forward differentiable view
auto diff_view_meta = torch::autograd::impl::get_view_autograd_meta(base);
if (diff_view_meta && diff_view_meta->has_fw_view()) {
const auto& base_fw_info = diff_view_meta->get_forward_view();


@@ -276,7 +276,7 @@ const Variable& AutogradMeta::fw_grad(
return ForwardGrad::undef_grad();
}
-// Ensure that concurent fw_grad() "reads" are thread safe
+// Ensure that concurrent fw_grad() "reads" are thread safe
std::lock_guard<std::mutex> lock(mutex_);
const auto& direct_fw_grad =


@@ -187,7 +187,7 @@ void autogradNotImplementedFallbackImpl(
t.use_count() <= 1, op_name); // Okay to return undefined tensor
// note(crcrpar): `_foreach_norm` returns a list of scalar Tensors and
// each Tensor shares a storage of a hidden, intermediate 1D Tensor
-// created inside the CUDA implemenetation. This is because the
+// created inside the CUDA implementation. This is because the
// reference implementation of nvidia/apex repo returns this 1D Tensor
// where each element represents the norm of corresponding input Tensor,
// here I want to return the same number of Tensors as the input
@@ -357,7 +357,7 @@ void autogradNotImplementedInplaceOrViewFallbackImpl(
? CreationMeta::INFERENCE_MODE
: (at::GradMode::is_enabled() ? CreationMeta::MULTI_OUTPUT_NODE
: CreationMeta::NO_GRAD_MODE));
-// ^ pass in creation meta unecessarily even if not isDifferentiableType,
+// ^ pass in creation meta unnecessarily even if not isDifferentiableType,
// but we don't have that
// information here anyway.
stack->at(stack->size() - num_returns + aliased_output_idx) = result;


@@ -29,7 +29,7 @@ Variable VariableInfo::zeros(at::OptionalDeviceGuard& device_guard) const {
}
// This function has two main goals:
-// 1) Use the user-provided jvp function to populate the the outputs' forward
+// 1) Use the user-provided jvp function to populate the outputs' forward
// gradient 2) Perform error checking to ensure that view and inplace ops are
// properly handled
//


@@ -411,7 +411,7 @@ std::vector<Node*> get_current_graph_task_execution_order() {
}
// We could potentially check if there is only a single device here
-// but explicitly require this context doens't seem bad either
+// but explicitly require this context doesn't seem bad either
TORCH_CHECK(
!c10::AutogradState::get_tls_state().get_multithreading_enabled(),
"get_current_graph_task_execution_order expects the current backward to be "
@@ -849,7 +849,7 @@ void validate_outputs(
if (grad.layout() != metadata.layout()) {
// TODO: Currently we only support (*, Sparse) combination for
// (tensor.layout(), tensor.grad.layout()) In future, there will be an
-// oppportunity to support more combinations of layouts if they are
+// opportunity to support more combinations of layouts if they are
// composable (example., operations like addition etc., are well defined
// between tensors of different layouts.), as well as all parts of
// autograd like AccumulateGrad correctly handle this. We allow grad to be
@@ -1501,7 +1501,7 @@ void GraphTask::init_to_execute(
// recursion, but the actual code does this iteratively. Refer to the
// numbering to see how the actual code corresponds. A difference to note is
// that in the iterative version, when you are working with the current Node,
-// you are reponsible to update your parent's is_needed after all your
+// you are responsible to update your parent's is_needed after all your
// children have been updated.
//
// is_needed = {fn: True for fn in outputs} # (0)


@@ -560,7 +560,7 @@ struct TORCH_API Node : std::enable_shared_from_this<Node> {
variable_list traced_apply(variable_list inputs);
// Sequence number used to correlate backward nodes with forward ops in the
-// profiler and provide determinisim in the engine.
+// profiler and provide determinism in the engine.
// NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes)
const uint64_t sequence_nr_;


@@ -138,7 +138,7 @@ struct TORCH_API AccumulateGrad : public Node {
// shallow copy. We need a shallow copy so that modifying the original
// grad tensor doesn't modify the grad we accumulate.
// We only skip clone if indices and values themselves are contiguous
-// for backward compatiblity reasons. Since without this optimization,
+// for backward compatibility reasons. Since without this optimization,
// earlier we would clone the entire SparseTensor which cloned indices
// and values.
// For details see https://github.com/pytorch/pytorch/issues/34375.


@@ -143,7 +143,7 @@ struct GraphTask : std::enable_shared_from_this<GraphTask> {
// The value of worker_device in the thread that created this task.
// See Note [Reentrant backwards]
-// Safe to read owner_ and reentrant_depth_ without synchronizaton
+// Safe to read owner_ and reentrant_depth_ without synchronization
int owner_;
// The number of parent graph tasks for this graph task
const int reentrant_depth_;


@@ -148,7 +148,7 @@ void InputBuffer::add(
// (4) var is a CUDA variable and it shares a device with the producer but
// not the consumer:
// (4a) Uses the producer device's default stream as the accumulation
-// stream (4b) Syncs the accumulation stream with the the producer's
+// stream (4b) Syncs the accumulation stream with the producer's
// stream (4c) Accumulates.
// (5) var is a CUDA variable and it does not share a device with the
// consumer or producer.


@@ -109,7 +109,7 @@ struct TORCH_API ProfilerResult {
* For example, if part of the model is lowered to a dsp backend, then
* the execution of that part of the model is delegated to the backend.
* When backend finishes execution it has an option to provide profiling
-* information (latency only at th emoment) corresponding to different operators
+* information (latency only at the moment) corresponding to different operators
* that were executed in the backend.
* When such events are recorded by backend using this API, the event
* records will be collected by active kineto profiler. If no kineto profiler


@@ -44,13 +44,13 @@ namespace profiler {
// mapping. A corresponding entry is removed when the guard is destroyed,
// potentially revealing the previously set value for the same slot.
//
-// For the async tasks, slots previuosly set in the main thread before
+// For the async tasks, slots previously set in the main thread before
// launching of an async task are shared and visible in the async task.
//
// On the other hand, any adding or overwriting of the mapping by the
// async task is not visible to the main thread and any modification
// (including removal of the entries) in the main thread is not visible
-// to the async task if it happends after launching the task.
+// to the async task if it happens after launching the task.
//
// We use ThreadLocalDebugInfo (slot PROFILER_STATE) to store profiler config,
// as well as a list of events that happen during profiling.


@@ -196,7 +196,7 @@ Variable SavedVariable::unpack(std::shared_ptr<Node> saved_for) const {
}
// The version counter is correct.
-// Additionnally, if we deal with a non-leaf variable, we have its correct
+// Additionally, if we deal with a non-leaf variable, we have its correct
// grad_fn.
// If we have the original variable, we simply return it


@@ -56,7 +56,7 @@ class TORCH_API SavedVariable {
// we fall into the second case and its metadata is also saved separately.
// In that case, the grad_fn must be passed in to the unpack function when
// reconstructing the Variable (except when we are doing an inplace operation
-// on a view, see below). The field saved_orignal_ below reflects the two
+// on a view, see below). The field saved_original_ below reflects the two
// cases: its value is true in the first case and false in the second case.
// The value data_.defined() can be false in three cases:
// 1. SavedVariable was constructed without a Tensor (the value to save is


@@ -664,14 +664,14 @@ const std::shared_ptr<torch::autograd::Node>& VariableHooks::grad_fn(
// self = inplace_op(self)
//
// For CPU/CUDA backends, we employ one AsStridedBackward0 Node to
-// represent the chain of view backward ops for effienciency.
+// represent the chain of view backward ops for efficiency.
//
// However in XLA backend we don't have full support of
// AsStridedBackward0, we instead run a full forward pass with a tensor
// that requires gradient to get proper grad_fn setup, then save it to
// DifferentiableViewMeta for future use. This is fairly cheap for XLA
// lazy tensor approach (but would be really expensive for CPU/CUDA). XLA
-// Tensor only run thorugh VariableType dispatch and lower the forward
+// Tensor only run through VariableType dispatch and lower the forward
// pass to a XLA HLO graph, then we take grad_fn and never materialize the
// tensor content. So we only construct the graph but not execute it,
// which is a fairly cheap operation to do.