[BE][11/16] fix typos in torch/ (torch/csrc/distributed/) (#156321)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/156321
Approved by: https://github.com/jingsh
ghstack dependencies: #156313, #156314, #156315, #156316, #156317, #156319
Author: Xuehai Pan
Date: 2025-06-22 22:22:34 +08:00
Committed by: PyTorch MergeBot
Parent: 5b210bb3a6
Commit: d55dc00f84
34 changed files with 58 additions and 57 deletions

@@ -1179,7 +1179,6 @@ exclude_patterns = [
'torch/utils/**',
'torch/csrc/jit/**',
'torch/csrc/jit/[a-o]*/**',
'torch/csrc/distributed/**',
]
init_command = [
'python3',

@@ -24,5 +24,6 @@ rebuilt
reenable
reenabled
requestor
ser'de
supercedes
te

@@ -15,7 +15,7 @@ class BackwardPassCleanupGuard;
// This is a singleton class responsible for running distributed backward
// passes. This engine relies heavily on the vanilla autograd engine and tries
// to re-use it as much as possible. This class is mostly responsible for the
// to reuse it as much as possible. This class is mostly responsible for the
// distributed aspects of autograd and tries to hook into the autograd engine
// where convenient.

@@ -45,7 +45,7 @@ RpcWithProfilingReq::RpcWithProfilingReq(
tensors_(std::move(tensors)),
profilerConfig_(std::move(profilerConfig)),
profilingKeyId_(profilingKeyId) {
TORCH_INTERNAL_ASSERT(wrappedRpc_ != nullptr, "wrappedRpc cant be null");
TORCH_INTERNAL_ASSERT(wrappedRpc_ != nullptr, "wrappedRpc can't be null");
}
rpc::MessageType RpcWithProfilingReq::wrappedMessageType() const {

@@ -323,7 +323,7 @@ FileStore::~FileStore() {
auto numFinishedWorker = addHelper(cleanupKey_, 1);
auto refCount = addHelper(refCountKey_, -1);
// The last worker cleans up the file. If numWorkers was not initialized to
// a specific postive value (i.e. meaning that there was not a fixed number
// a specific positive value (i.e. meaning that there was not a fixed number
// of workers), we don't attempt to clean.
// Clean up the file if number of references is 0.
if (refCount == 0 && numWorkers_ >= 0 && numFinishedWorker >= numWorkers_) {

@@ -145,7 +145,7 @@ struct FlightRecorder {
std::optional<c10::time_t> time_discovered_started_;
// timestamp when our CPU threads discovered that the kernel completed.
// will always be _after_ it actually complated, and can be the same time
// will always be _after_ it actually completed, and can be the same time
// as the discovery of the start if the watchdog thread is stuck on CUDA
// APIs
std::optional<c10::time_t> time_discovered_completed_;

@@ -965,7 +965,7 @@ c10::intrusive_ptr<Work> ProcessGroupGloo::allreduce_sparse(
const AllreduceOptions& opts) {
// all reduce sparse calls into default allreduce which
// implemented with all_gathering indices and values
// we do ths we do not have a native cuda implementation
// we do this we do not have a native cuda implementation
return allreduce(inputs, opts);
}

@@ -65,7 +65,7 @@ struct WorkEntry {
// That is, The process may be multi-threaded, and multiple threads may make
// MPI calls, but only one at a time: MPI calls are not made concurrently from
// two distinct threads (all MPI calls are serialized). However, with
// MPI_THREAD_SERIALIZED, ProcessGroupMPI will only support a singe process
// MPI_THREAD_SERIALIZED, ProcessGroupMPI will only support a single process
// group. In other words, no more than 1 process group can be created globally.
//
// If you would like to use multiple ProcessGroupMPI, it requires your MPI

@@ -1423,7 +1423,7 @@ void ProcessGroupNCCL::abortCommsFromMap(
bool ProcessGroupNCCL::abortComms(
const std::optional<std::string>& abortReason) {
// Remove record from global ncclCommMemPoolMapMutex before aboarting,
// so that a new cache segment would not register to already aborded
// so that a new cache segment would not register to already aborted
// communicators. Note that ncclCommMemPoolMap is a global container which may
// contain other PG's communicators, thus we need to only erase communicators
// for the current PG.
@@ -1451,9 +1451,9 @@ void ProcessGroupNCCL::abort() {
terminateProcessGroup_.store(true);
watchdog_->notify();
// lauch abort asynchrounously and wait for it to complete or timeout
// launch abort asynchronously and wait for it to complete or timeout
LOG(INFO) << logPrefix()
<< "Launching ProcessGroupNCCL abort asynchrounously.";
<< "Launching ProcessGroupNCCL abort asynchronously.";
std::future<bool> fut =
std::async(std::launch::async, [this]() { return this->abortComms(); });
@@ -1655,7 +1655,7 @@ std::string ProcessGroupNCCL::HeartbeatMonitor::getNCCLWatchdogTimeoutExitMsg(
void ProcessGroupNCCL::HeartbeatMonitor::setLastWorkListUpdateTime(
std::chrono::time_point<std::chrono::steady_clock> time) {
// We intentially let the race condition to happen but this is ok
// We intentionally let the race condition to happen but this is ok
// as long as we update the time, we know we are making progress.
lastWorkListUpdateTime_ = time;
}
@@ -1761,7 +1761,7 @@ void ProcessGroupNCCL::HeartbeatMonitor::runLoop() {
// 1. The current rank is the first to observe a timeout in watchdog.
// (shouldDump_ was set to true by the watchdog thread).
// 2. Other ranks detected the timeout and signal the current rank to
// dump. In addtion, monitor threads will dump if watchdog threads has no
// dump. In addition, monitor threads will dump if watchdog threads has no
// heartbeat or dumpPipe is not empty.
if (shouldDump_.load()) {
errorMsg = getNCCLWatchdogTimeoutErrorMsg("this local rank");
@@ -3030,7 +3030,7 @@ std::shared_ptr<NCCLComm> ProcessGroupNCCL::initNCCLComm(
bool useScalableInit = false;
// (nranks / nroots) == 128 was the default NCCL recommended
// accoring to
// according to
// https://github.com/pytorch/pytorch/pull/136789#discussion_r1779171615.
auto ranksPerRoot = getCvarInt(TORCH_NCCL_RANKS_PER_ROOT, 128);
#if defined(NCCL_HAS_INIT_RANK_SCALABLE) && defined(NCCL_HAS_CONFIG)
@@ -3327,7 +3327,7 @@ c10::intrusive_ptr<ProcessGroupNCCL::WorkNCCL> ProcessGroupNCCL::initWork(
// - initially, moved record() into workEnqueue(), but found that makes it
// hard to get access to profilingTitle,
// inputs, and outputs for metadata recording, and we don't want to attach
// these objects to the Work becuase it has implications for keeping those
// these objects to the Work because it has implications for keeping those
// tensors alive longer and adds overhead when copying Work objects
// between threads
r->trace_id_ = FlightRecorderCUDA::get()->record(
@@ -3442,7 +3442,7 @@ void ProcessGroupNCCL::startCoalescing() {
// ops from a coalesce group into the flight recorder, we want to have the
// same seq_ for those ops and its 'endCoalescing' op. Hence we bump during
// start, which has one minor downside- we burn a seq_ if someone ever does a
// 'start' and 'end' coalescing region without doing an operation inbetween.
// 'start' and 'end' coalescing region without doing an operation in between.
coalescedDevice_.set_index(-1);
coalescedComm_ = nullptr;
@@ -3462,7 +3462,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::endCoalescing(OpType optype) {
}
TORCH_CHECK(
coalescedDevice_.index() >= 0,
"Somthing went wrong. Did you call end_coalescing before start_coalescing?");
"Something went wrong. Did you call end_coalescing before start_coalescing?");
// `coalescedComm_` should have same set of comms across collectives
auto comm = coalescedComm_;
@@ -3618,7 +3618,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::collective(
device, rank_, opType, false, profilingTitle, inputs, outputs, enqueue);
if (coalescing_state_) {
// When coalescing, we record events per op that lack timing/state
// information becuase there is no 'work' associated with them, and then
// information because there is no 'work' associated with them, and then
// later in endCoalescing we record a 'coalesced' Work which has
// timing/state updates via watchdog thread, but lacks op metadata such as
// input/output sizes and profilingTitle per-op in the group.
@@ -3781,7 +3781,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::collectiveCoalesced(
// collective so there is no flight record and we increment seqCollective_ and
// op_id_ together. Compare this to startCoalescing/endCoalescing flow where
// we increment either seqP2P_ or seqCollective_ once per group and increment
// op_id_ once per indvidual operation within the group
// op_id_ once per individual operation within the group
op_id_++;
const auto key = getKeyFromDevice(device);
@@ -4016,7 +4016,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::pointToPoint(
c10::intrusive_ptr<ProcessGroupNCCL::WorkNCCL> work;
if (coalescing_state_) {
// When coalescing, we record events per op that lack timing/state
// information becuase there is no 'work' associated with them, and then
// information because there is no 'work' associated with them, and then
// later in endCoalescing we record a 'coalesced' Work which has
// timing/state updates via watchdog thread, but lacks op metadata such as
// input/output sizes and profilingTitle per-op in the group.
@@ -4397,7 +4397,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::allreduce_coalesced(
std::make_tuple(
static_cast<int64_t>(seqCollective_) + 1,
false), // seq + 1 to match collective and assume only one collective
// in coalesed range
// in coalesced range
std::make_tuple(pg_uid_, pg_desc_), // PG name tuple
tensors, // inputTensors
tensors, // outputTensors
@@ -4694,7 +4694,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::allgather(
// User-facing outputTensors should be held by the user until after
// waiting on work_, or the call makes no sense. We do a stashing here
// in case user doesn't hold the outputTensors in downstream code,
// which can cause an early recyle by the CachingAllocator, which can
// which can cause an early recycle by the CachingAllocator, which can
// lead to segfault or data corruption.
if (opts.asyncOp) {
work->stashed_for_allocator_safety_->stash(outputTensors_);
@@ -4742,7 +4742,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::allgather_into_tensor_coalesced(
std::make_tuple(
static_cast<int64_t>(seqCollective_) + 1,
false), // seq + 1 to match collective and assume only one collective
// in coalesed range
// in coalesced range
std::make_tuple(pg_uid_, pg_desc_), // PG name tuple
inputs, // inputTensors
outputs, // outputTensors
@@ -4956,7 +4956,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::reduce_scatter_tensor_coalesced(
std::make_tuple(
static_cast<int64_t>(seqCollective_) + 1,
false), // seq + 1 to match collective and assume only one collective
// in coalesed range
// in coalesced range
std::make_tuple(pg_uid_, pg_desc_), // PG name tuple
inputs, // inputTensors
outputs, // outputTensors

@@ -1291,7 +1291,7 @@ class TORCH_API ProcessGroupNCCL : public Backend {
// communication, the key will be "1:2" on both processes. Note: this is for
// the scenario where there is only 1 GPU per process. When it comes to
// multiple GPUs per process, this part may need to redesigned.
// TODO: we probably need a separte map for P2P comms
// TODO: we probably need a separate map for P2P comms
std::unordered_map<std::string, std::shared_ptr<NCCLComm>> devNCCLCommMap_;
// The NCCL communicators currently in process of being initialized.
@@ -1316,7 +1316,7 @@ class TORCH_API ProcessGroupNCCL : public Backend {
std::atomic<bool> hasPendingHooks_{};
// This is the signal from watchdog threads to indicate whether the monitor
// thread should dump. Making it static so that it is accessiable from all the
// thread should dump. Making it static so that it is accessible from all the
// PGs. With this flag, monitor thread would dump debug info under any one of
// the three conditions:
//

@@ -397,7 +397,7 @@ class WriterPayload : public c10::intrusive_ptr_target {
void registeredInLoop() {
/*
This refcount increment must be matched by a reclaim call.
Call this method after sucessfully scheduling this handle with a loop.
Call this method after successfully scheduling this handle with a loop.
*/
at::raw::intrusive_ptr::incref(this);
}

@@ -573,9 +573,9 @@ using SizeType = uint64_t;
// (https://stackoverflow.com/a/20295079), and thus `errno` should really only
// be inspected if an error occurred.
//
// `success_cond` is an expression used to check if an error has happend. So for
// `fork()`, we can use `SYSCHECK(pid = fork(), pid != -1)`. The function output
// is stored in variable `__output` and may be used in `success_cond`.
// `success_cond` is an expression used to check if an error has happened. So
// for `fork()`, we can use `SYSCHECK(pid = fork(), pid != -1)`. The function
// output is stored in variable `__output` and may be used in `success_cond`.
#ifdef _WIN32
#define SYSCHECK(expr, success_cond) \
while (true) { \

@@ -118,7 +118,7 @@ class TORCH_API Work : public torch::CustomClassHolder {
// Get a Future object that would be marked as either success or failure
// This API can be used by the user to track the completion of the work
// and hanlde the exception if any.
// and handle the exception if any.
virtual c10::intrusive_ptr<c10::ivalue::Future> getFutureResult();
virtual float getDuration() const;

@@ -67,7 +67,8 @@ class TORCH_API GradBucket {
return parameters_;
}
// Returns whther this bucket is the last bucket to allreduce in an iteration.
// Returns whether this bucket is the last bucket to allreduce in an
// iteration.
bool isLast() const {
return index_ == bucket_count_ - 1;
}

@@ -5,7 +5,7 @@
#include <ATen/cuda/nvrtc_stub/ATenNVRTC.h>
#include <c10/cuda/CUDAGuard.h>
// Two warninngs in Cutlass included header files
// Two warnings in Cutlass included header files
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wset-but-not-used")
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wunused-but-set-parameter")
@@ -163,7 +163,7 @@ at::Tensor async_input_mm_impl(
TORCH_CHECK(
M % num_chunks_M == 0,
"async_input_mm: `a.shape(0)` must be an interger multiple of `a_chunk_signals.numel()`");
"async_input_mm: `a.shape(0)` must be an integer multiple of `a_chunk_signals.numel()`");
size_t chunk_size_M = M / num_chunks_M;
size_t tile_size_M = cute::get<0>(TileShape_MNK{});
@@ -248,7 +248,7 @@ at::Tensor async_input_mm_out(
});
#else
TORCH_CHECK(
false, "async_input_mm is not currenlty supported on your device");
false, "async_input_mm is not currently supported on your device");
#endif
return out;
}

@@ -3,7 +3,7 @@
* that supports consuming asynchronous input. This tile scheduler introduces the following arguments:
*
* - tiles_per_chunk_m Specifies the size of an M chunk. Chunks are the granularity at which the
* asynchronous input becomes ready. It must be an interger multiple of the size of an M tile.
* asynchronous input becomes ready. It must be an integer multiple of the size of an M tile.
*
* - chunk_signals chunk_signals[i] == 1 indicates that chunk i is ready. Before returning a work
* tile, get_current_work() waits for the signal to ensure that the corresponding chunk is ready.
@@ -327,7 +327,7 @@ public:
wait_signal(scheduler_params.chunk_signals + chunk_idx);
}
// An arbirary, non-default id
// An arbitrary, non-default id
constexpr int barrier_id = 8;
arch::NamedBarrier barrier(NumThreadsPerWarp, barrier_id);
barrier.arrive_and_wait();

@@ -1006,7 +1006,7 @@ This class does not support ``__members__`` property.)");
});
#ifdef USE_NVSHMEM
// Intializes the device state in CUmodule so that its able to perform
// Initializes the device state in CUmodule so that its able to perform
// NVSHMEM operations.
module.def(
"_nvshmemx_cumodule_init",
@@ -3297,7 +3297,7 @@ Arguments:
Default is False.
Attributes:
config (NCCLConfig): configures NCCL communicators (only avaiable for
config (NCCLConfig): configures NCCL communicators (only available for
builds using NCCL 2.17+). This can be used to improve
communication-computation overlap for NCCL kernels by tuning
available parameters in the config. See

@@ -10,7 +10,7 @@ namespace c10d {
// A struct to hold the latest status of the process group.
struct ProcessGroupStatus {
// the sequential number of the last collective enqueued into workMetaList_
// This is useful for indentifying a rank that has not join a collective
// This is useful for identifying a rank that has not join a collective
// initialized to be -1 to indicate no collective has been enqueued
int64_t lastEnqueuedSeq{-1};
// the sequential number of the last collective started as the kernel

@@ -15,7 +15,7 @@ class TORCH_PYTHON_API PythonCommHook : public CommHookInterface {
// The state is passed to the hook in runHook method, and it can be used to
// maintain and update any state information during the execution of the hook.
// The hook performs user-specified processing and returns a future indicating
// asychronous communication of gradients.
// asynchronous communication of gradients.
PythonCommHook(py::object state, py::object hook)
: state_(std::move(state)), hook_(std::move(hook)) {}

@@ -1245,7 +1245,7 @@ void Reducer::initialize_buckets(
// patterns when copy_ing grad data in and out of its bucket view.
// However, numerics remain correct, because the bucket view is the same
// on either end of the raw allreduce. bucket_view_in.copy(grad)
// tranposes
// transposes
// (+ densifies) to the bucket view's layout, the data is allreduced,
// then grad.copy_(bucket_view_out) transposes it back to grad's layout.
//

@@ -564,7 +564,7 @@ class TORCH_API Reducer {
// Retrieves parameter corresponding to the given VariableIndex.
at::Tensor& get_param_from_index(size_t index);
// Python reducer keeps C++ reducer initialized. To remove this flag,
// we need to refactor the DDP wrapper's initilization.
// we need to refactor the DDP wrapper's initialization.
bool use_python_reducer_;
// Cached bucket index to model parameter mapping. Populated after buckets

@@ -50,7 +50,7 @@ __device__ __forceinline__ void trap() {
#if defined(USE_ROCM)
// abort() calls trap() under the covers. However, on ROCm, the trap is
// handled differently inside hip runtime. It collects a gpu core dump and
// causes linux kernerl to create a core dump of the host application.
// causes linux kernel to create a core dump of the host application.
abort();
#else
__trap();

@@ -56,7 +56,7 @@
INT_SWITCH_CASE(k_alignment, 8, __VA_ARGS__); \
INT_SWITCH_CASE(k_alignment, 4, __VA_ARGS__); \
default: { \
TORCH_CHECK(false, "Not implemented for aligment=", alignment); \
TORCH_CHECK(false, "Not implemented for alignment=", alignment); \
} \
}

@@ -156,7 +156,7 @@ int IpcChannel::recv_fd() {
.msg_control = cbuf,
.msg_controllen = sizeof(cbuf)};
// Recieve message on socket_
// Receive message on socket_
TORCH_CHECK(
recvmsg(socket_, &msg, 0) > 0,
"Failed to receive fd: ",

@@ -62,7 +62,7 @@ static NvlMesh getNvlMesh(const std::vector<int>& rankToDeviceIdx) {
}
/**
* Detech topology given a NvlMesh.
* Detect topology given a NvlMesh.
*/
static Topology detectTopology(const NvlMesh nvlMesh, size_t worldSize) {
if (getCvarBool(TEST_INTRA_NODE_COMM, false)) {

@@ -101,7 +101,7 @@ void initialize_nvshmem_with_store(
LOG(INFO) << "NVSHMEM is available, version: " << major << "." << minor;
}
// Intializes the device state in CUmodule so that its able to perform NVSHMEM
// Initializes the device state in CUmodule so that its able to perform NVSHMEM
// operations.
void nvshmemx_cumodule_init(uintptr_t module) {
auto cumodule = reinterpret_cast<CUmodule>(module);
@@ -546,7 +546,7 @@ at::Tensor nvshmem_all_to_all_vdev_2d(
| c0 | d0 | c1 | d1 | c2 | d2 | c3 | d3 |
where each `c_i` / `d_i` are slices of the `input` tensor, targeting
expert `i`, with length indicated by input splits (in
`in_out_splits[0]`). That is, the 2D AllToAllv shuffle achives a
`in_out_splits[0]`). That is, the 2D AllToAllv shuffle achieves a
transpose from rank-major order at input to expert-major order at
output.

@@ -14,7 +14,7 @@ void initialize_nvshmem_with_store(
// Check if NVSHMEM is available
TORCH_API bool is_nvshmem_available();
// Intializes the device state in CUmodule so that its able to perform NVSHMEM
// Initializes the device state in CUmodule so that its able to perform NVSHMEM
// operations.
TORCH_API void nvshmemx_cumodule_init(uintptr_t module);

@@ -24,7 +24,7 @@ TORCH_API std::unordered_map<std::string, worker_id_t> collectCurrentNames(
const worker_id_t selfId,
const std::string& selfName);
// Remove name frmo Store, used in dynamic RPC groups.
// Remove name from Store, used in dynamic RPC groups.
// NOTE: This needs to be called with the Dynamic RPC group
// membership management token held.
TORCH_API void removeCurrentName(

@@ -16,7 +16,7 @@ namespace torch::distributed::rpc {
namespace {
py::tuple toPyTuple(const RRefForkData& rrefForkData) {
// add GIL as it is contructing a py::object
// add GIL as it is constructing a py::object
pybind11::gil_scoped_acquire ag;
return py::make_tuple(
rrefForkData.ownerId_,

@@ -121,7 +121,7 @@ PythonRpcHandler& PythonRpcHandler::getInstance() {
// initialization by calling `new PythonRpcHandler()`, inside of which GIL is
// also required. Static data initialization is thread-safe, so the thread
// holding the GIL will wait for the other thread to finish static data
// initializating before going forward. Because the initialization can't
// initializing before going forward. Because the initialization can't
// proceed without GIL, there is a deadlock. We ask the calling thread to
// release GIL to avoid this situation.
TORCH_INTERNAL_ASSERT(!PyGILState_Check());

@@ -348,7 +348,7 @@ c10::intrusive_ptr<OwnerRRef> RRefContext::getOrCreateOwnerRRef(
// here is a plain TensorType, they are not equal relationship:
// specialized TensorType <: plain TensorType
//
// In RPC we don't care the difference as we ser/de with just the
// In RPC we don't care the difference as we ser'de with just the
// plain TensorType. This is not a issue for UserRRef creation either,
// since Tensor can only get specialized with a previous run of local
// JIT function, and we shouldn't preserve the specialized SubTensorType

@@ -318,7 +318,7 @@ class TORCH_API RRefContext {
// RRef is forwarded to the callee as new UserRRefs (if the callee is not
// the owner). In this case, we block running the user function until all
// UserRRefs are confirmed by the owner.
// This contract gurantees that no UserRRefs can be used remotely without
// This contract guarantees that no UserRRefs can be used remotely without
// confirmation. Note that, however, the UserRRef created by rpc.remote can
// still be passed to local functions as arguments and used there. This is by
// design, because this feature is especially useful when, say a master node

@@ -371,7 +371,7 @@ void TensorPipeAgent::checkAndSetStaticGroup(
isStaticGroupKey, std::vector<uint8_t>(), isStaticGroupVec);
std::string returnedVal = std::string(returnedVec.begin(), returnedVec.end());
// In both cases, the returned value should be the value of isStaticGroupStr,
// otherwise there is a discrepency with initialization among one of the
// otherwise there is a discrepancy with initialization among one of the
// members
TORCH_CHECK(
returnedVal == isStaticGroupStr,

@@ -121,8 +121,8 @@ struct TORCH_API TensorPipeRpcBackendOptions : public RpcBackendOptions {
deviceMaps[workerName] = deviceMap;
} else {
for (auto& entry : deviceMap) {
// c10::Device has no default constructor, hence map[device] dosn't work
// In C++-17 we can use insert_or_assign.
// c10::Device has no default constructor, hence map[device] doesn't
// work In C++-17 we can use insert_or_assign.
auto entryIter = iter->second.find(entry.first);
if (entryIter == iter->second.end()) {
iter->second.emplace(entry.first, entry.second);