From 69aa6b4bb92b54cf24568236d35009a476ee906f Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Mon, 6 Mar 2023 18:05:10 +0000 Subject: [PATCH] fix typo in comments under torch/csrc/autograd (#96061) This PR fixes typos in comments of `.cpp` and `.h` files under `torch/csrc/autograd` directory Pull Request resolved: https://github.com/pytorch/pytorch/pull/96061 Approved by: https://github.com/soulitzer --- torch/csrc/autograd/FunctionsManual.cpp | 34 +++++++++---------- torch/csrc/autograd/VariableTypeUtils.h | 2 +- torch/csrc/autograd/autograd_meta.cpp | 2 +- .../autograd_not_implemented_fallback.cpp | 4 +-- torch/csrc/autograd/custom_function.cpp | 2 +- torch/csrc/autograd/engine.cpp | 6 ++-- torch/csrc/autograd/function.h | 2 +- .../csrc/autograd/functions/accumulate_grad.h | 2 +- torch/csrc/autograd/graph_task.h | 2 +- torch/csrc/autograd/input_buffer.cpp | 2 +- torch/csrc/autograd/profiler_kineto.h | 2 +- torch/csrc/autograd/profiler_legacy.cpp | 4 +-- torch/csrc/autograd/saved_variable.cpp | 2 +- torch/csrc/autograd/saved_variable.h | 2 +- torch/csrc/autograd/variable.cpp | 4 +-- 15 files changed, 36 insertions(+), 36 deletions(-) diff --git a/torch/csrc/autograd/FunctionsManual.cpp b/torch/csrc/autograd/FunctionsManual.cpp index 5e53972efa69..3a5d1bdc8623 100644 --- a/torch/csrc/autograd/FunctionsManual.cpp +++ b/torch/csrc/autograd/FunctionsManual.cpp @@ -1684,7 +1684,7 @@ Tensor cholesky_jvp(const Tensor& dA, const Tensor& L, bool upper) { // L^{-1}dA(L^{-H}) = L^{-1}dL + (L^{-1}dL)^H // = sym(L^{-1}dL) // where sym(X) = X + X^H - // A short computaiton gives that the inverse of sym is given by + // A short computation gives that the inverse of sym is given by // \pi(X) = X.tril() - 0.5*diag(X) // so // dL = L\pi(L^{-1}dA(L^{-H})) @@ -1787,8 +1787,8 @@ Tensor cholesky_inverse_jvp( // of Ap^i, A^j, dA^k with i, j, k in {1, H}, where X^H = X.mH(). To prove that, // note (A Ap)^H = A Ap and (Ap A)^H = Ap A, which could be shown by taking the // product between the SVD decompositions of A and Ap. Consider the -// conjugate-tranposed [2]: (A Ap A)^H = A^H (A Ap) = A^H. By differentiating it -// we get: dA^H A Ap + A^H dA Ap + A^H A dAp = dA^H. By multiplying from the +// conjugate-transposed [2]: (A Ap A)^H = A^H (A Ap) = A^H. By differentiating +// it we get: dA^H A Ap + A^H dA Ap + A^H A dAp = dA^H. By multiplying from the // left by Ap^H and using Ap^H A^H = (A Ap)^H = A Ap: Ap^H dA^H A Ap + A Ap dA // Ap + A Ap A dAp = Ap^H dA^H. By multiplying from the left by Ap and by // applying [1] and [2] repeatedly until impossible we get: Ap Ap^H dA^H A Ap + @@ -2368,7 +2368,7 @@ Tensor softplus_double_backward( // this later) // 4. Return the as_strided view of the storage tensor using input geometry. // -// In step (2), if the output tensor does't have overlapping memory, we can +// In step (2), if the output tensor doesn't have overlapping memory, we can // safely scatter (`storage.as_strided(output_geometry).copy_(grad)`); // otherwise, we must use `index_add` as gradients at different indices may need // to be summed to a single location. @@ -2501,12 +2501,12 @@ Tensor softplus_double_backward( // // Note that all values in `S(n)` are the same (they point to the same // memory location anyways, so this step doesn't change anything, but -// effectively avoids having the denpendency on the layout of `input`. +// effectively avoids having the dependency on the layout of `input`. // I.e., the result holds fixed regardless of the layout of `input`, as // long as `input_stride` is fixed. // -// NOTE: for forward pass, we can equivalently simply selet any one of -// `S(n)` as `storage[n]`. However, cosnidering this as an average +// NOTE: for forward pass, we can equivalently simply select any one of +// `S(n)` as `storage[n]`. However, considering this as an average // operation makes backward easier (so all values in set // `{ grad_input[i] : i in S(n) }` are the same, and it can use the // same geometry as input). @@ -2645,7 +2645,7 @@ Tensor softplus_double_backward( // stride[B[j]] // // Then the invariant is obviously satisfied at every dimension -// in this block if it is satisfied at dimnesion B[-1]. It only +// in this block if it is satisfied at dimension B[-1]. It only // remains to show that it is satisfied at the last dimension in // each block. // @@ -3212,7 +3212,7 @@ Tensor svd_backward( // where CP(n-1) is the complex projective space of dimension n-1. // In other words, M is just the complex projective space, and pi is (pretty // similar to) the usual principal bundle from S^{2n-1} to CP(n-1). The case k - // > 1 is the same, but requiring a linear inependence condition between the + // > 1 is the same, but requiring a linear independence condition between the // vectors from the different S^{2n-1} or CP(n-1). // // Note that this is a U(1)^k-bundle. In plain words, this means that the @@ -3672,14 +3672,14 @@ Tensor linalg_qr_backward( const Tensor& Q, const Tensor& R, const c10::string_view mode) { - // Nb. We won't be too formal below, as writing this proof formaly is a pain + // Nb. We won't be too formal below, as writing this proof formally is a pain // We'll link here a formal writing of all this at some point in the future // // Case m >= n // dQ = dAR^{-1} - Qsyminv(sym(Q^H dA R^{-1})) // dR = syminv(sym(Q^H dA R^{-1}))R // - // With the notation from the JVP formla, the only two computations that we + // With the notation from the JVP formula, the only two computations that we // need are syminv*(R) = 0.5 * (R.triu() + R.triu()^H - Re diag(R)) sym*(X) = // 2 * X Using these, after a few simplifications we get that gA = (gQ + // syminvadj(triu(gR R^H - Q^H gQ)))R^{-H} @@ -4712,14 +4712,14 @@ std::tuple _trilinear_backward( } Tensor log1p_backward(const Tensor& grad, const Tensor& self) { - // We must conditionally initalize this using to_dense if sparse, sparse + // We must conditionally initialize this using to_dense if sparse, sparse // addition is not supported without exact shape match Tensor self_p1_conj; if (self.layout() == c10::kSparse || self.layout() == c10::kSparseCsr || self.layout() == c10::kSparseCsc || self.layout() == c10::kSparseBsr || self.layout() == c10::kSparseBsc) { // The warning only applies to the sparsity of self, dense grad is never - // materialized so if self is strided and grad is sparse nothing unepected + // materialized so if self is strided and grad is sparse nothing unexpected // happens memory wise TORCH_WARN( "log1p_backward: received self with sparse layout, but backward requires materialization of a dense tensor with this shape"); @@ -4959,7 +4959,7 @@ std::tuple householder_product_backward( // better performance bool modify_K_in_place = !at::GradMode::is_enabled(); - // This method exploites that at k-th iteration vector v_k has only elements + // This method exploits that at k-th iteration vector v_k has only elements // v_k[k:] which are non-zero. auto update_grad = [&m]( int64_t k, @@ -5217,7 +5217,7 @@ std::tuple ormqr_backward( if (self_requires_grad || tau_requires_grad) { if (left ^ transpose) { // Assume left = true and transpose = false. The case with - // left = false and tranpose = true is very much similar with just + // left = false and transpose = true is very much similar with just // transposed arguments passed into householder_product_backward. // Ormqr computes B = H_1 * ... * H_k * A. // The sensivity wrt H_i is given by (see notes in @@ -6068,7 +6068,7 @@ Tensor gather_with_keepdimed_indices( // P^T dA1 = dL U1 + L dU1 => [left-multiply by L^{-1}, right-multiply by // U1^{-1}] L^{-1} P^T dA1 U1^{-1} = L^{-1} dL + dU1 U1^{-1} (**). Note, L is // lower-triangular, and so is its inverse, hence L^{-1} dL is lower-triangular. -// Also, since the diagonal of L (all ones) is never exposed explicity (packed +// Also, since the diagonal of L (all ones) is never exposed explicitly (packed // representation), the diagonal of dL is zero, and hence diag(L^{-1} dL) = 0. // Assuming that U1 is full-rank, similarly, dU1 U1^{-1} is upper-triangular. // Combining these observations we conclude: @@ -6351,7 +6351,7 @@ Tensor logsumexp_jvp( const Tensor& self_t, IntArrayRef dim, bool keepdim) { - // NB: for simplicitly, we recompute some values that can be reused from + // NB: for simplicity, we recompute some values that can be reused from // forward auto self_p_exp = [&self_p, &dim]() { if (self_p.sym_numel() > 0) { diff --git a/torch/csrc/autograd/VariableTypeUtils.h b/torch/csrc/autograd/VariableTypeUtils.h index 34eda5378721..a308aea3e0a1 100644 --- a/torch/csrc/autograd/VariableTypeUtils.h +++ b/torch/csrc/autograd/VariableTypeUtils.h @@ -330,7 +330,7 @@ inline std::vector as_view( "Non-backward differentiable views must have creation_meta=CreationMeta::DEFAULT"); } if (is_fw_differentiable) { - // Check if base is a forward differentiabble view + // Check if base is a forward differentiable view auto diff_view_meta = torch::autograd::impl::get_view_autograd_meta(base); if (diff_view_meta && diff_view_meta->has_fw_view()) { const auto& base_fw_info = diff_view_meta->get_forward_view(); diff --git a/torch/csrc/autograd/autograd_meta.cpp b/torch/csrc/autograd/autograd_meta.cpp index 968bc5139141..900a5e69944b 100644 --- a/torch/csrc/autograd/autograd_meta.cpp +++ b/torch/csrc/autograd/autograd_meta.cpp @@ -276,7 +276,7 @@ const Variable& AutogradMeta::fw_grad( return ForwardGrad::undef_grad(); } - // Ensure that concurent fw_grad() "reads" are thread safe + // Ensure that concurrent fw_grad() "reads" are thread safe std::lock_guard lock(mutex_); const auto& direct_fw_grad = diff --git a/torch/csrc/autograd/autograd_not_implemented_fallback.cpp b/torch/csrc/autograd/autograd_not_implemented_fallback.cpp index 890a7fa3e6e9..0c0e07c84ccc 100644 --- a/torch/csrc/autograd/autograd_not_implemented_fallback.cpp +++ b/torch/csrc/autograd/autograd_not_implemented_fallback.cpp @@ -187,7 +187,7 @@ void autogradNotImplementedFallbackImpl( t.use_count() <= 1, op_name); // Okay to return undefined tensor // note(crcrpar): `_foreach_norm` returns a list of scalar Tensors and // each Tensor shares a storage of a hidden, intermediate 1D Tensor - // created inside the CUDA implemenetation. This is because the + // created inside the CUDA implementation. This is because the // reference implementation of nvidia/apex repo returns this 1D Tensor // where each element represents the norm of corresponding input Tensor, // here I want to return the same number of Tensors as the input @@ -357,7 +357,7 @@ void autogradNotImplementedInplaceOrViewFallbackImpl( ? CreationMeta::INFERENCE_MODE : (at::GradMode::is_enabled() ? CreationMeta::MULTI_OUTPUT_NODE : CreationMeta::NO_GRAD_MODE)); - // ^ pass in creation meta unecessarily even if not isDifferentiableType, + // ^ pass in creation meta unnecessarily even if not isDifferentiableType, // but we don't have that // information here anyway. stack->at(stack->size() - num_returns + aliased_output_idx) = result; diff --git a/torch/csrc/autograd/custom_function.cpp b/torch/csrc/autograd/custom_function.cpp index 05b3642c1572..527a87a87aa3 100644 --- a/torch/csrc/autograd/custom_function.cpp +++ b/torch/csrc/autograd/custom_function.cpp @@ -29,7 +29,7 @@ Variable VariableInfo::zeros(at::OptionalDeviceGuard& device_guard) const { } // This function has two main goals: -// 1) Use the user-provided jvp function to populate the the outputs' forward +// 1) Use the user-provided jvp function to populate the outputs' forward // gradient 2) Perform error checking to ensure that view and inplace ops are // properly handled // diff --git a/torch/csrc/autograd/engine.cpp b/torch/csrc/autograd/engine.cpp index 965c2dc109ae..61078b22d0c4 100644 --- a/torch/csrc/autograd/engine.cpp +++ b/torch/csrc/autograd/engine.cpp @@ -411,7 +411,7 @@ std::vector get_current_graph_task_execution_order() { } // We could potentially check if there is only a single device here - // but explicitly require this context doens't seem bad either + // but explicitly require this context doesn't seem bad either TORCH_CHECK( !c10::AutogradState::get_tls_state().get_multithreading_enabled(), "get_current_graph_task_execution_order expects the current backward to be " @@ -849,7 +849,7 @@ void validate_outputs( if (grad.layout() != metadata.layout()) { // TODO: Currently we only support (*, Sparse) combination for // (tensor.layout(), tensor.grad.layout()) In future, there will be an - // oppportunity to support more combinations of layouts if they are + // opportunity to support more combinations of layouts if they are // composable (example., operations like addition etc., are well defined // between tensors of different layouts.), as well as all parts of // autograd like AccumulateGrad correctly handle this. We allow grad to be @@ -1501,7 +1501,7 @@ void GraphTask::init_to_execute( // recursion, but the actual code does this iteratively. Refer to the // numbering to see how the actual code corresponds. A difference to note is // that in the iterative version, when you are working with the current Node, - // you are reponsible to update your parent's is_needed after all your + // you are responsible to update your parent's is_needed after all your // children have been updated. // // is_needed = {fn: True for fn in outputs} # (0) diff --git a/torch/csrc/autograd/function.h b/torch/csrc/autograd/function.h index 05ba3edecf07..8fbf4104a1fb 100644 --- a/torch/csrc/autograd/function.h +++ b/torch/csrc/autograd/function.h @@ -560,7 +560,7 @@ struct TORCH_API Node : std::enable_shared_from_this { variable_list traced_apply(variable_list inputs); // Sequence number used to correlate backward nodes with forward ops in the - // profiler and provide determinisim in the engine. + // profiler and provide determinism in the engine. // NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes) const uint64_t sequence_nr_; diff --git a/torch/csrc/autograd/functions/accumulate_grad.h b/torch/csrc/autograd/functions/accumulate_grad.h index 15c266faed54..4e3768d33492 100644 --- a/torch/csrc/autograd/functions/accumulate_grad.h +++ b/torch/csrc/autograd/functions/accumulate_grad.h @@ -138,7 +138,7 @@ struct TORCH_API AccumulateGrad : public Node { // shallow copy. We need a shallow copy so that modifying the original // grad tensor doesn't modify the grad we accumulate. // We only skip clone if indices and values themselves are contiguous - // for backward compatiblity reasons. Since without this optimization, + // for backward compatibility reasons. Since without this optimization, // earlier we would clone the entire SparseTensor which cloned indices // and values. // For details see https://github.com/pytorch/pytorch/issues/34375. diff --git a/torch/csrc/autograd/graph_task.h b/torch/csrc/autograd/graph_task.h index 4d0a7ea84fe7..6256e44b3987 100644 --- a/torch/csrc/autograd/graph_task.h +++ b/torch/csrc/autograd/graph_task.h @@ -143,7 +143,7 @@ struct GraphTask : std::enable_shared_from_this { // The value of worker_device in the thread that created this task. // See Note [Reentrant backwards] - // Safe to read owner_ and reentrant_depth_ without synchronizaton + // Safe to read owner_ and reentrant_depth_ without synchronization int owner_; // The number of parent graph tasks for this graph task const int reentrant_depth_; diff --git a/torch/csrc/autograd/input_buffer.cpp b/torch/csrc/autograd/input_buffer.cpp index a8d8b9880faa..f1e3b6981719 100644 --- a/torch/csrc/autograd/input_buffer.cpp +++ b/torch/csrc/autograd/input_buffer.cpp @@ -148,7 +148,7 @@ void InputBuffer::add( // (4) var is a CUDA variable and it shares a device with the producer but // not the consumer: // (4a) Uses the producer device's default stream as the accumulation - // stream (4b) Syncs the accumulation stream with the the producer's + // stream (4b) Syncs the accumulation stream with the producer's // stream (4c) Accumulates. // (5) var is a CUDA variable and it does not share a device with the // consumer or producer. diff --git a/torch/csrc/autograd/profiler_kineto.h b/torch/csrc/autograd/profiler_kineto.h index 37764c480e8a..69277c90d186 100644 --- a/torch/csrc/autograd/profiler_kineto.h +++ b/torch/csrc/autograd/profiler_kineto.h @@ -109,7 +109,7 @@ struct TORCH_API ProfilerResult { * For example, if part of the model is lowered to a dsp backend, then * the execution of that part of the model is delegated to the backend. * When backend finishes execution it has an option to provide profiling - * information (latency only at th emoment) corresponding to different operators + * information (latency only at the moment) corresponding to different operators * that were executed in the backend. * When such events are recorded by backend using this API, the event * records will be collected by active kineto profiler. If no kineto profiler diff --git a/torch/csrc/autograd/profiler_legacy.cpp b/torch/csrc/autograd/profiler_legacy.cpp index 35b8fac7e876..388695957e45 100644 --- a/torch/csrc/autograd/profiler_legacy.cpp +++ b/torch/csrc/autograd/profiler_legacy.cpp @@ -44,13 +44,13 @@ namespace profiler { // mapping. A corresponding entry is removed when the guard is destroyed, // potentially revealing the previously set value for the same slot. // -// For the async tasks, slots previuosly set in the main thread before +// For the async tasks, slots previously set in the main thread before // launching of an async task are shared and visible in the async task. // // On the other hand, any adding or overwriting of the mapping by the // async task is not visible to the main thread and any modification // (including removal of the entries) in the main thread is not visible -// to the async task if it happends after launching the task. +// to the async task if it happens after launching the task. // // We use ThreadLocalDebugInfo (slot PROFILER_STATE) to store profiler config, // as well as a list of events that happen during profiling. diff --git a/torch/csrc/autograd/saved_variable.cpp b/torch/csrc/autograd/saved_variable.cpp index d438205e8947..52fab04c336f 100644 --- a/torch/csrc/autograd/saved_variable.cpp +++ b/torch/csrc/autograd/saved_variable.cpp @@ -196,7 +196,7 @@ Variable SavedVariable::unpack(std::shared_ptr saved_for) const { } // The version counter is correct. - // Additionnally, if we deal with a non-leaf variable, we have its correct + // Additionally, if we deal with a non-leaf variable, we have its correct // grad_fn. // If we have the original variable, we simply return it diff --git a/torch/csrc/autograd/saved_variable.h b/torch/csrc/autograd/saved_variable.h index 6861f2f2f690..8100e6e2bb4f 100644 --- a/torch/csrc/autograd/saved_variable.h +++ b/torch/csrc/autograd/saved_variable.h @@ -56,7 +56,7 @@ class TORCH_API SavedVariable { // we fall into the second case and its metadata is also saved separately. // In that case, the grad_fn must be passed in to the unpack function when // reconstructing the Variable (except when we are doing an inplace operation - // on a view, see below). The field saved_orignal_ below reflects the two + // on a view, see below). The field saved_original_ below reflects the two // cases: its value is true in the first case and false in the second case. // The value data_.defined() can be false in three cases: // 1. SavedVariable was constructed without a Tensor (the value to save is diff --git a/torch/csrc/autograd/variable.cpp b/torch/csrc/autograd/variable.cpp index f6fcb1083d6e..dacbe90d13be 100644 --- a/torch/csrc/autograd/variable.cpp +++ b/torch/csrc/autograd/variable.cpp @@ -664,14 +664,14 @@ const std::shared_ptr& VariableHooks::grad_fn( // self = inplace_op(self) // // For CPU/CUDA backends, we employ one AsStridedBackward0 Node to - // represent the chain of view backward ops for effienciency. + // represent the chain of view backward ops for efficiency. // // However in XLA backend we don't have full support of // AsStridedBackward0, we instead run a full forward pass with a tensor // that requires gradient to get proper grad_fn setup, then save it to // DifferentiableViewMeta for future use. This is fairly cheap for XLA // lazy tensor approach (but would be really expensive for CPU/CUDA). XLA - // Tensor only run thorugh VariableType dispatch and lower the forward + // Tensor only run through VariableType dispatch and lower the forward // pass to a XLA HLO graph, then we take grad_fn and never materialize the // tensor content. So we only construct the graph but not execute it, // which is a fairly cheap operation to do.