From d55dc00f8479699f454e91c779b53e2a16cffc21 Mon Sep 17 00:00:00 2001
From: Xuehai Pan
Date: Sun, 22 Jun 2025 22:22:34 +0800
Subject: [PATCH] [BE][11/16] fix typos in torch/ (torch/csrc/distributed/) (#156321)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/156321
Approved by: https://github.com/jingsh
ghstack dependencies: #156313, #156314, #156315, #156316, #156317, #156319
---
 .lintrunner.toml                                   |  1 -
 tools/linter/dictionary.txt                        |  1 +
 .../distributed/autograd/engine/dist_engine.h      |  2 +-
 .../rpc_messages/rpc_with_profiling_req.cpp        |  2 +-
 torch/csrc/distributed/c10d/FileStore.cpp          |  2 +-
 .../csrc/distributed/c10d/FlightRecorder.hpp       |  2 +-
 .../distributed/c10d/ProcessGroupGloo.cpp          |  2 +-
 .../csrc/distributed/c10d/ProcessGroupMPI.hpp      |  2 +-
 .../distributed/c10d/ProcessGroupNCCL.cpp          | 32 +++++++++----------
 .../distributed/c10d/ProcessGroupNCCL.hpp          |  4 +--
 .../distributed/c10d/TCPStoreLibUvBackend.cpp      |  2 +-
 torch/csrc/distributed/c10d/Utils.hpp              |  6 ++--
 torch/csrc/distributed/c10d/Work.hpp               |  2 +-
 torch/csrc/distributed/c10d/comm.hpp               |  3 +-
 torch/csrc/distributed/c10d/cuda/AsyncMM.cu        |  6 ++--
 .../persistent_async_input_scheduler.cuh           |  4 +--
 torch/csrc/distributed/c10d/init.cpp               |  4 +--
 torch/csrc/distributed/c10d/logger.hpp             |  2 +-
 .../csrc/distributed/c10d/python_comm_hook.h       |  2 +-
 torch/csrc/distributed/c10d/reducer.cpp            |  2 +-
 torch/csrc/distributed/c10d/reducer.hpp            |  2 +-
 .../c10d/symm_mem/CUDASymmetricMemory-inl.h        |  2 +-
 .../c10d/symm_mem/CUDASymmetricMemoryOps.cu        |  2 +-
 .../symm_mem/CUDASymmetricMemoryUtils.cpp          |  2 +-
 .../c10d/symm_mem/intra_node_comm.cpp              |  2 +-
 .../c10d/symm_mem/nvshmem_extension.cu             |  4 +--
 .../c10d/symm_mem/nvshmem_extension.cuh            |  2 +-
 torch/csrc/distributed/rpc/agent_utils.h           |  2 +-
 torch/csrc/distributed/rpc/py_rref.cpp             |  2 +-
 .../distributed/rpc/python_rpc_handler.cpp         |  2 +-
 torch/csrc/distributed/rpc/rref_context.cpp        |  2 +-
 torch/csrc/distributed/rpc/rref_context.h          |  2 +-
 .../csrc/distributed/rpc/tensorpipe_agent.cpp      |  2 +-
 torch/csrc/distributed/rpc/tensorpipe_agent.h      |  4 +--
 34 files changed, 58 insertions(+), 57 deletions(-)

diff --git a/.lintrunner.toml b/.lintrunner.toml
index b592caa049a2..33961b8248b3 100644
--- a/.lintrunner.toml
+++ b/.lintrunner.toml
@@ -1179,7 +1179,6 @@ exclude_patterns = [
     'torch/utils/**',
     'torch/csrc/jit/**',
     'torch/csrc/jit/[a-o]*/**',
-    'torch/csrc/distributed/**',
 ]
 init_command = [
     'python3',
diff --git a/tools/linter/dictionary.txt b/tools/linter/dictionary.txt
index 47527239ea6d..a3da2299cf23 100644
--- a/tools/linter/dictionary.txt
+++ b/tools/linter/dictionary.txt
@@ -24,5 +24,6 @@ rebuilt
 reenable
 reenabled
 requestor
+ser'de
 supercedes
 te
diff --git a/torch/csrc/distributed/autograd/engine/dist_engine.h b/torch/csrc/distributed/autograd/engine/dist_engine.h
index 362c78fa07b1..7911462307fb 100644
--- a/torch/csrc/distributed/autograd/engine/dist_engine.h
+++ b/torch/csrc/distributed/autograd/engine/dist_engine.h
@@ -15,7 +15,7 @@ class BackwardPassCleanupGuard;
 
 // This is a singleton class responsible for running distributed backward
 // passes. This engine relies heavily on the vanilla autograd engine and tries
-// to re-use it as much as possible. This class is mostly responsible for the
+// to reuse it as much as possible. This class is mostly responsible for the
 // distributed aspects of autograd and tries to hook into the autograd engine
 // where convenient.
diff --git a/torch/csrc/distributed/autograd/rpc_messages/rpc_with_profiling_req.cpp b/torch/csrc/distributed/autograd/rpc_messages/rpc_with_profiling_req.cpp
index 19db3671c7de..52e3465f85ab 100644
--- a/torch/csrc/distributed/autograd/rpc_messages/rpc_with_profiling_req.cpp
+++ b/torch/csrc/distributed/autograd/rpc_messages/rpc_with_profiling_req.cpp
@@ -45,7 +45,7 @@ RpcWithProfilingReq::RpcWithProfilingReq(
       tensors_(std::move(tensors)),
       profilerConfig_(std::move(profilerConfig)),
       profilingKeyId_(profilingKeyId) {
-  TORCH_INTERNAL_ASSERT(wrappedRpc_ != nullptr, "wrappedRpc cant be null");
+  TORCH_INTERNAL_ASSERT(wrappedRpc_ != nullptr, "wrappedRpc can't be null");
 }
 
 rpc::MessageType RpcWithProfilingReq::wrappedMessageType() const {
diff --git a/torch/csrc/distributed/c10d/FileStore.cpp b/torch/csrc/distributed/c10d/FileStore.cpp
index 6fcbd4ad86f6..862c983d9e05 100644
--- a/torch/csrc/distributed/c10d/FileStore.cpp
+++ b/torch/csrc/distributed/c10d/FileStore.cpp
@@ -323,7 +323,7 @@ FileStore::~FileStore() {
   auto numFinishedWorker = addHelper(cleanupKey_, 1);
   auto refCount = addHelper(refCountKey_, -1);
   // The last worker cleans up the file. If numWorkers was not initialized to
-  // a specific postive value (i.e. meaning that there was not a fixed number
+  // a specific positive value (i.e. meaning that there was not a fixed number
   // of workers), we don't attempt to clean.
   // Clean up the file if number of references is 0.
   if (refCount == 0 && numWorkers_ >= 0 && numFinishedWorker >= numWorkers_) {
diff --git a/torch/csrc/distributed/c10d/FlightRecorder.hpp b/torch/csrc/distributed/c10d/FlightRecorder.hpp
index f8b25fd2b827..29e920564710 100644
--- a/torch/csrc/distributed/c10d/FlightRecorder.hpp
+++ b/torch/csrc/distributed/c10d/FlightRecorder.hpp
@@ -145,7 +145,7 @@ struct FlightRecorder {
     std::optional<c10::time_t> time_discovered_started_;
 
     // timestamp when our CPU threads discovered that the kernel completed.
-    // will always be _after_ it actually complated, and can be the same time
+    // will always be _after_ it actually completed, and can be the same time
    // as the discovery of the start if the watchdog thread is stuck on CUDA
    // APIs
    std::optional<c10::time_t> time_discovered_completed_;
diff --git a/torch/csrc/distributed/c10d/ProcessGroupGloo.cpp b/torch/csrc/distributed/c10d/ProcessGroupGloo.cpp
index 421b35989de4..087c2831b4ed 100644
--- a/torch/csrc/distributed/c10d/ProcessGroupGloo.cpp
+++ b/torch/csrc/distributed/c10d/ProcessGroupGloo.cpp
@@ -965,7 +965,7 @@ c10::intrusive_ptr<Work> ProcessGroupGloo::allreduce_sparse(
     const AllreduceOptions& opts) {
   // all reduce sparse calls into default allreduce which
   // implemented with all_gathering indices and values
-  // we do ths we do not have a native cuda implementation
+  // we do this we do not have a native cuda implementation
   return allreduce(inputs, opts);
 }
 
diff --git a/torch/csrc/distributed/c10d/ProcessGroupMPI.hpp b/torch/csrc/distributed/c10d/ProcessGroupMPI.hpp
index a2dc53884326..33bb696cf2a8 100644
--- a/torch/csrc/distributed/c10d/ProcessGroupMPI.hpp
+++ b/torch/csrc/distributed/c10d/ProcessGroupMPI.hpp
@@ -65,7 +65,7 @@ struct WorkEntry {
 // That is, The process may be multi-threaded, and multiple threads may make
 // MPI calls, but only one at a time: MPI calls are not made concurrently from
 // two distinct threads (all MPI calls are serialized). However, with
-// MPI_THREAD_SERIALIZED, ProcessGroupMPI will only support a singe process
+// MPI_THREAD_SERIALIZED, ProcessGroupMPI will only support a single process
 // group. In other words, no more than 1 process group can be created globally.
 //
 // If you would like to use multiple ProcessGroupMPI, it requires your MPI
diff --git a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp
index 8e881d3f2617..a71921110fca 100644
--- a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp
+++ b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp
@@ -1423,7 +1423,7 @@ void ProcessGroupNCCL::abortCommsFromMap(
 bool ProcessGroupNCCL::abortComms(
     const std::optional<std::string>& abortReason) {
   // Remove record from global ncclCommMemPoolMapMutex before aboarting,
-  // so that a new cache segment would not register to already aborded
+  // so that a new cache segment would not register to already aborted
   // communicators. Note that ncclCommMemPoolMap is a global container which may
   // contain other PG's communicators, thus we need to only erase communicators
   // for the current PG.
@@ -1451,9 +1451,9 @@ void ProcessGroupNCCL::abort() {
   terminateProcessGroup_.store(true);
   watchdog_->notify();
 
-  // lauch abort asynchrounously and wait for it to complete or timeout
+  // launch abort asynchronously and wait for it to complete or timeout
   LOG(INFO) << logPrefix()
-            << "Launching ProcessGroupNCCL abort asynchrounously.";
+            << "Launching ProcessGroupNCCL abort asynchronously.";
   std::future<bool> fut =
       std::async(std::launch::async, [this]() { return this->abortComms(); });
@@ -1655,7 +1655,7 @@ std::string ProcessGroupNCCL::HeartbeatMonitor::getNCCLWatchdogTimeoutExitMsg(
 void ProcessGroupNCCL::HeartbeatMonitor::setLastWorkListUpdateTime(
     std::chrono::time_point<std::chrono::steady_clock> time) {
-  // We intentially let the race condition to happen but this is ok
+  // We intentionally let the race condition to happen but this is ok
   // as long as we update the time, we know we are making progress.
   lastWorkListUpdateTime_ = time;
 }
@@ -1761,7 +1761,7 @@ void ProcessGroupNCCL::HeartbeatMonitor::runLoop() {
   // 1. The current rank is the first to observe a timeout in watchdog.
   // (shouldDump_ was set to true by the watchdog thread).
   // 2. Other ranks detected the timeout and signal the current rank to
-  // dump. In addtion, monitor threads will dump if watchdog threads has no
+  // dump. In addition, monitor threads will dump if watchdog threads has no
   // heartbeat or dumpPipe is not empty.
   if (shouldDump_.load()) {
     errorMsg = getNCCLWatchdogTimeoutErrorMsg("this local rank");
@@ -3030,7 +3030,7 @@ std::shared_ptr<NCCLComm> ProcessGroupNCCL::initNCCLComm(
 
   bool useScalableInit = false;
   // (nranks / nroots) == 128 was the default NCCL recommended
-  // accoring to
+  // according to
   // https://github.com/pytorch/pytorch/pull/136789#discussion_r1779171615.
   auto ranksPerRoot = getCvarInt(TORCH_NCCL_RANKS_PER_ROOT, 128);
 #if defined(NCCL_HAS_INIT_RANK_SCALABLE) && defined(NCCL_HAS_CONFIG)
@@ -3327,7 +3327,7 @@ c10::intrusive_ptr<ProcessGroupNCCL::WorkNCCL> ProcessGroupNCCL::initWork(
   // - initially, moved record() into workEnqueue(), but found that makes it
   //   hard to get access to profilingTitle,
   //   inputs, and outputs for metadata recording, and we don't want to attach
-  //   these objects to the Work becuase it has implications for keeping those
+  //   these objects to the Work because it has implications for keeping those
   //   tensors alive longer and adds overhead when copying Work objects
   //   between threads
   r->trace_id_ = FlightRecorderCUDA::get()->record(
@@ -3442,7 +3442,7 @@ void ProcessGroupNCCL::startCoalescing() {
   // ops from a coalesce group into the flight recorder, we want to have the
   // same seq_ for those ops and its 'endCoalescing' op. Hence we bump during
   // start, which has one minor downside- we burn a seq_ if someone ever does a
-  // 'start' and 'end' coalescing region without doing an operation inbetween.
+  // 'start' and 'end' coalescing region without doing an operation in between.
 
   coalescedDevice_.set_index(-1);
   coalescedComm_ = nullptr;
@@ -3462,7 +3462,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::endCoalescing(OpType optype) {
   }
   TORCH_CHECK(
       coalescedDevice_.index() >= 0,
-      "Somthing went wrong. Did you call end_coalescing before start_coalescing?");
+      "Something went wrong. Did you call end_coalescing before start_coalescing?");
 
   // `coalescedComm_` should have same set of comms across collectives
   auto comm = coalescedComm_;
@@ -3618,7 +3618,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::collective(
       device, rank_, opType, false, profilingTitle, inputs, outputs, enqueue);
   if (coalescing_state_) {
     // When coalescing, we record events per op that lack timing/state
-    // information becuase there is no 'work' associated with them, and then
+    // information because there is no 'work' associated with them, and then
     // later in endCoalescing we record a 'coalesced' Work which has
     // timing/state updates via watchdog thread, but lacks op metadata such as
     // input/output sizes and profilingTitle per-op in the group.
@@ -3781,7 +3781,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::collectiveCoalesced(
   // collective so there is no flight record and we increment seqCollective_ and
   // op_id_ together. Compare this to startCoalescing/endCoalescing flow where
   // we increment either seqP2P_ or seqCollective_ once per group and increment
-  // op_id_ once per indvidual operation within the group
+  // op_id_ once per individual operation within the group
   op_id_++;
 
   const auto key = getKeyFromDevice(device);
@@ -4016,7 +4016,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::pointToPoint(
   c10::intrusive_ptr<ProcessGroupNCCL::WorkNCCL> work;
   if (coalescing_state_) {
     // When coalescing, we record events per op that lack timing/state
-    // information becuase there is no 'work' associated with them, and then
+    // information because there is no 'work' associated with them, and then
     // later in endCoalescing we record a 'coalesced' Work which has
     // timing/state updates via watchdog thread, but lacks op metadata such as
     // input/output sizes and profilingTitle per-op in the group.
@@ -4397,7 +4397,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::allreduce_coalesced(
       std::make_tuple(
           static_cast<uint64_t>(seqCollective_) + 1,
           false), // seq + 1 to match collective and assume only one collective
-      // in coalesed range
+      // in coalesced range
       std::make_tuple(pg_uid_, pg_desc_), // PG name tuple
       tensors, // inputTensors
       tensors, // outputTensors
@@ -4694,7 +4694,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::allgather(
         // User-facing outputTensors should be held by the user until after
         // waiting on work_, or the call makes no sense. We do a stashing here
         // in case user doesn't hold the outputTensors in downstream code,
-        // which can cause an early recyle by the CachingAllocator, which can
+        // which can cause an early recycle by the CachingAllocator, which can
         // lead to segfault or data corruption.
         if (opts.asyncOp) {
           work->stashed_for_allocator_safety_->stash(outputTensors_);
@@ -4742,7 +4742,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::allgather_into_tensor_coalesced(
       std::make_tuple(
           static_cast<uint64_t>(seqCollective_) + 1,
           false), // seq + 1 to match collective and assume only one collective
-      // in coalesed range
+      // in coalesced range
       std::make_tuple(pg_uid_, pg_desc_), // PG name tuple
       inputs, // inputTensors
       outputs, // outputTensors
@@ -4956,7 +4956,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::reduce_scatter_tensor_coalesced(
       std::make_tuple(
           static_cast<uint64_t>(seqCollective_) + 1,
           false), // seq + 1 to match collective and assume only one collective
-      // in coalesed range
+      // in coalesced range
       std::make_tuple(pg_uid_, pg_desc_), // PG name tuple
       inputs, // inputTensors
       outputs, // outputTensors
diff --git a/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp
index 795cc2db8914..1274af22492c 100644
--- a/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp
+++ b/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp
@@ -1291,7 +1291,7 @@ class TORCH_API ProcessGroupNCCL : public Backend {
   // communication, the key will be "1:2" on both processes. Note: this is for
   // the scenario where there is only 1 GPU per process. When it comes to
   // multiple GPUs per process, this part may need to redesigned.
-  // TODO: we probably need a separte map for P2P comms
+  // TODO: we probably need a separate map for P2P comms
   std::unordered_map<std::string, std::shared_ptr<NCCLComm>> devNCCLCommMap_;
 
   // The NCCL communicators currently in process of being initialized.
@@ -1316,7 +1316,7 @@ class TORCH_API ProcessGroupNCCL : public Backend {
   std::atomic<bool> hasPendingHooks_{};
 
   // This is the signal from watchdog threads to indicate whether the monitor
-  // thread should dump. Making it static so that it is accessiable from all the
+  // thread should dump. Making it static so that it is accessible from all the
   // PGs. With this flag, monitor thread would dump debug info under any one of
   // the three conditions:
   //
diff --git a/torch/csrc/distributed/c10d/TCPStoreLibUvBackend.cpp b/torch/csrc/distributed/c10d/TCPStoreLibUvBackend.cpp
index 25223fca442c..52354de93edf 100644
--- a/torch/csrc/distributed/c10d/TCPStoreLibUvBackend.cpp
+++ b/torch/csrc/distributed/c10d/TCPStoreLibUvBackend.cpp
@@ -397,7 +397,7 @@ class WriterPayload : public c10::intrusive_ptr_target {
   void registeredInLoop() {
     /* This refcount increment must be matched by a reclaim call.
-    Call this method after sucessfully scheduling this handle with a loop.
+    Call this method after successfully scheduling this handle with a loop.
     */
     at::raw::intrusive_ptr::incref(this);
   }
 
diff --git a/torch/csrc/distributed/c10d/Utils.hpp b/torch/csrc/distributed/c10d/Utils.hpp
index 411b23a783cc..03bd6ef3cafd 100644
--- a/torch/csrc/distributed/c10d/Utils.hpp
+++ b/torch/csrc/distributed/c10d/Utils.hpp
@@ -573,9 +573,9 @@ using SizeType = uint64_t;
 // (https://stackoverflow.com/a/20295079), and thus `errno` should really only
 // be inspected if an error occurred.
 //
-// `success_cond` is an expression used to check if an error has happend. So for
-// `fork()`, we can use `SYSCHECK(pid = fork(), pid != -1)`. The function output
-// is stored in variable `__output` and may be used in `success_cond`.
+// `success_cond` is an expression used to check if an error has happened. So
+// for `fork()`, we can use `SYSCHECK(pid = fork(), pid != -1)`. The function
+// output is stored in variable `__output` and may be used in `success_cond`.
 #ifdef _WIN32
 #define SYSCHECK(expr, success_cond) \
   while (true) {                     \
diff --git a/torch/csrc/distributed/c10d/Work.hpp b/torch/csrc/distributed/c10d/Work.hpp
index 5fd6c6c73788..e9e785a9c643 100644
--- a/torch/csrc/distributed/c10d/Work.hpp
+++ b/torch/csrc/distributed/c10d/Work.hpp
@@ -118,7 +118,7 @@ class TORCH_API Work : public torch::CustomClassHolder {
 
   // Get a Future object that would be marked as either success or failure
   // This API can be used by the user to track the completion of the work
-  // and hanlde the exception if any.
+  // and handle the exception if any.
   virtual c10::intrusive_ptr<c10::ivalue::Future> getFutureResult();
 
   virtual float getDuration() const;
diff --git a/torch/csrc/distributed/c10d/comm.hpp b/torch/csrc/distributed/c10d/comm.hpp
index 6f9203e21434..599c1709c4df 100644
--- a/torch/csrc/distributed/c10d/comm.hpp
+++ b/torch/csrc/distributed/c10d/comm.hpp
@@ -67,7 +67,8 @@ class TORCH_API GradBucket {
     return parameters_;
   }
 
-  // Returns whther this bucket is the last bucket to allreduce in an iteration.
+  // Returns whether this bucket is the last bucket to allreduce in an
+  // iteration.
   bool isLast() const {
     return index_ == bucket_count_ - 1;
   }
diff --git a/torch/csrc/distributed/c10d/cuda/AsyncMM.cu b/torch/csrc/distributed/c10d/cuda/AsyncMM.cu
index a01fec5f73ff..3b7effb3a7d6 100644
--- a/torch/csrc/distributed/c10d/cuda/AsyncMM.cu
+++ b/torch/csrc/distributed/c10d/cuda/AsyncMM.cu
@@ -5,7 +5,7 @@
 #include
 #include
 
-// Two warninngs in Cutlass included header files
+// Two warnings in Cutlass included header files
 C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wset-but-not-used")
 C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wunused-but-set-parameter")
@@ -163,7 +163,7 @@ at::Tensor async_input_mm_impl(
   TORCH_CHECK(
       M % num_chunks_M == 0,
-      "async_input_mm: `a.shape(0)` must be an interger multiple of `a_chunk_signals.numel()`");
+      "async_input_mm: `a.shape(0)` must be an integer multiple of `a_chunk_signals.numel()`");
 
   size_t chunk_size_M = M / num_chunks_M;
   size_t tile_size_M = cute::get<0>(TileShape_MNK{});
@@ -248,7 +248,7 @@ at::Tensor async_input_mm_out(
   });
 #else
   TORCH_CHECK(
-      false, "async_input_mm is not currenlty supported on your device");
+      false, "async_input_mm is not currently supported on your device");
 #endif
   return out;
 }
diff --git a/torch/csrc/distributed/c10d/cuda/cutlass/gemm/kernel/persistent_async_input_scheduler.cuh b/torch/csrc/distributed/c10d/cuda/cutlass/gemm/kernel/persistent_async_input_scheduler.cuh
index 3c8ef2a052a0..0610a862f158 100644
--- a/torch/csrc/distributed/c10d/cuda/cutlass/gemm/kernel/persistent_async_input_scheduler.cuh
+++ b/torch/csrc/distributed/c10d/cuda/cutlass/gemm/kernel/persistent_async_input_scheduler.cuh
@@ -3,7 +3,7 @@
  * that supports consuming asynchronous input. This tile scheduler introduces the following arguments:
  *
  * - tiles_per_chunk_m – Specifies the size of an M chunk. Chunks are the granularity at which the
- *   asynchronous input becomes ready. It must be an interger multiple of the size of an M tile.
+ *   asynchronous input becomes ready. It must be an integer multiple of the size of an M tile.
 *
 * - chunk_signals – chunk_signals[i] == 1 indicates that chunk i is ready. Before returning a work
 *   tile, get_current_work() waits for the signal to ensure that the corresponding chunk is ready.
@@ -327,7 +327,7 @@ public:
       wait_signal(scheduler_params.chunk_signals + chunk_idx);
     }
 
-    // An arbirary, non-default id
+    // An arbitrary, non-default id
     constexpr int barrier_id = 8;
     arch::NamedBarrier barrier(NumThreadsPerWarp, barrier_id);
     barrier.arrive_and_wait();
diff --git a/torch/csrc/distributed/c10d/init.cpp b/torch/csrc/distributed/c10d/init.cpp
index fddc374cd637..695715a332f5 100644
--- a/torch/csrc/distributed/c10d/init.cpp
+++ b/torch/csrc/distributed/c10d/init.cpp
@@ -1006,7 +1006,7 @@ This class does not support ``__members__`` property.)");
       });
 
 #ifdef USE_NVSHMEM
-  // Intializes the device state in CUmodule so that it’s able to perform
+  // Initializes the device state in CUmodule so that it’s able to perform
   // NVSHMEM operations.
   module.def(
       "_nvshmemx_cumodule_init",
@@ -3297,7 +3297,7 @@ Arguments:
         Default is False.
 
 Attributes:
-    config (NCCLConfig): configures NCCL communicators (only avaiable for
+    config (NCCLConfig): configures NCCL communicators (only available for
         builds using NCCL 2.17+). This can be used to improve
         communication-computation overlap for NCCL kernels by tuning available
         parameters in the config. See
diff --git a/torch/csrc/distributed/c10d/logger.hpp b/torch/csrc/distributed/c10d/logger.hpp
index fb5c044f5d80..cd562af7473a 100644
--- a/torch/csrc/distributed/c10d/logger.hpp
+++ b/torch/csrc/distributed/c10d/logger.hpp
@@ -10,7 +10,7 @@ namespace c10d {
 // A struct to hold the latest status of the process group.
 struct ProcessGroupStatus {
   // the sequential number of the last collective enqueued into workMetaList_
-  // This is useful for indentifying a rank that has not join a collective
+  // This is useful for identifying a rank that has not join a collective
   // initialized to be -1 to indicate no collective has been enqueued
   int64_t lastEnqueuedSeq{-1};
   // the sequential number of the last collective started as the kernel
diff --git a/torch/csrc/distributed/c10d/python_comm_hook.h b/torch/csrc/distributed/c10d/python_comm_hook.h
index 48ad7cefae94..a63f03fbf8c1 100644
--- a/torch/csrc/distributed/c10d/python_comm_hook.h
+++ b/torch/csrc/distributed/c10d/python_comm_hook.h
@@ -15,7 +15,7 @@ class TORCH_PYTHON_API PythonCommHook : public CommHookInterface {
   // The state is passed to the hook in runHook method, and it can be used to
   // maintain and update any state information during the execution of the hook.
   // The hook performs user-specified processing and returns a future indicating
-  // asychronous communication of gradients.
+  // asynchronous communication of gradients.
   PythonCommHook(py::object state, py::object hook)
       : state_(std::move(state)), hook_(std::move(hook)) {}
 
diff --git a/torch/csrc/distributed/c10d/reducer.cpp b/torch/csrc/distributed/c10d/reducer.cpp
index 9b2cc9f5eedf..1e9e7006a663 100644
--- a/torch/csrc/distributed/c10d/reducer.cpp
+++ b/torch/csrc/distributed/c10d/reducer.cpp
@@ -1245,7 +1245,7 @@ void Reducer::initialize_buckets(
       // patterns when copy_ing grad data in and out of its bucket view.
       // However, numerics remain correct, because the bucket view is the same
       // on either end of the raw allreduce. bucket_view_in.copy_(grad)
-      // tranposes
+      // transposes
       // (+ densifies) to the bucket view's layout, the data is allreduced,
       // then grad.copy_(bucket_view_out) transposes it back to grad's layout.
       //
diff --git a/torch/csrc/distributed/c10d/reducer.hpp b/torch/csrc/distributed/c10d/reducer.hpp
index 43536bd515df..6707975d38ac 100644
--- a/torch/csrc/distributed/c10d/reducer.hpp
+++ b/torch/csrc/distributed/c10d/reducer.hpp
@@ -564,7 +564,7 @@ class TORCH_API Reducer {
   // Retrieves parameter corresponding to the given VariableIndex.
   at::Tensor& get_param_from_index(size_t index);
   // Python reducer keeps C++ reducer initialized. To remove this flag,
-  // we need to refactor the DDP wrapper's initilization.
+  // we need to refactor the DDP wrapper's initialization.
   bool use_python_reducer_;
 
   // Cached bucket index to model parameter mapping. Populated after buckets
diff --git a/torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory-inl.h b/torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory-inl.h
index ef2a712db344..bf47a3bdc1a2 100644
--- a/torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory-inl.h
+++ b/torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory-inl.h
@@ -50,7 +50,7 @@ __device__ __forceinline__ void trap() {
 #if defined(USE_ROCM)
   // abort() calls trap() under the covers. However, on ROCm, the trap is
   // handled differently inside hip runtime. It collects a gpu core dump and
-  // causes linux kernerl to create a core dump of the host application.
+  // causes linux kernel to create a core dump of the host application.
   abort();
 #else
   __trap();
diff --git a/torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryOps.cu b/torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryOps.cu
index 353770a42fec..698c6cffd036 100644
--- a/torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryOps.cu
+++ b/torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryOps.cu
@@ -56,7 +56,7 @@
     INT_SWITCH_CASE(k_alignment, 8, __VA_ARGS__);                     \
     INT_SWITCH_CASE(k_alignment, 4, __VA_ARGS__);                     \
     default: {                                                        \
-      TORCH_CHECK(false, "Not implemented for aligment=", alignment); \
+      TORCH_CHECK(false, "Not implemented for alignment=", alignment); \
     }                                                                 \
   }
diff --git a/torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryUtils.cpp b/torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryUtils.cpp
index f13941ba5a27..225304faca65 100644
--- a/torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryUtils.cpp
+++ b/torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryUtils.cpp
@@ -156,7 +156,7 @@ int IpcChannel::recv_fd() {
       .msg_control = cbuf,
       .msg_controllen = sizeof(cbuf)};
 
-  // Recieve message on socket_
+  // Receive message on socket_
   TORCH_CHECK(
       recvmsg(socket_, &msg, 0) > 0,
       "Failed to receive fd: ",
diff --git a/torch/csrc/distributed/c10d/symm_mem/intra_node_comm.cpp b/torch/csrc/distributed/c10d/symm_mem/intra_node_comm.cpp
index b11dfa07de3c..0d53d100cee7 100644
--- a/torch/csrc/distributed/c10d/symm_mem/intra_node_comm.cpp
+++ b/torch/csrc/distributed/c10d/symm_mem/intra_node_comm.cpp
@@ -62,7 +62,7 @@ static NvlMesh getNvlMesh(const std::vector& rankToDeviceIdx) {
 }
 
 /**
- * Detech topology given a NvlMesh.
+ * Detect topology given a NvlMesh.
 */
 static Topology detectTopology(const NvlMesh nvlMesh, size_t worldSize) {
   if (getCvarBool(TEST_INTRA_NODE_COMM, false)) {
diff --git a/torch/csrc/distributed/c10d/symm_mem/nvshmem_extension.cu b/torch/csrc/distributed/c10d/symm_mem/nvshmem_extension.cu
index 5b46dec53f7d..672663303fec 100644
--- a/torch/csrc/distributed/c10d/symm_mem/nvshmem_extension.cu
+++ b/torch/csrc/distributed/c10d/symm_mem/nvshmem_extension.cu
@@ -101,7 +101,7 @@ void initialize_nvshmem_with_store(
   LOG(INFO) << "NVSHMEM is available, version: " << major << "." << minor;
 }
 
-// Intializes the device state in CUmodule so that it’s able to perform NVSHMEM
+// Initializes the device state in CUmodule so that it’s able to perform NVSHMEM
 // operations.
 void nvshmemx_cumodule_init(uintptr_t module) {
   auto cumodule = reinterpret_cast<CUmodule>(module);
@@ -546,7 +546,7 @@ at::Tensor nvshmem_all_to_all_vdev_2d(
         | c0 | d0 | c1 | d1 | c2 | d2 | c3 | d3 |
     where each `c_i` / `d_i` are slices of the `input` tensor, targeting
     expert `i`, with length indicated by input splits (in
-    `in_out_splits[0]`). That is, the 2D AllToAllv shuffle achives a
+    `in_out_splits[0]`). That is, the 2D AllToAllv shuffle achieves a
     transpose from rank-major order at input to expert-major order at
     output.
diff --git a/torch/csrc/distributed/c10d/symm_mem/nvshmem_extension.cuh b/torch/csrc/distributed/c10d/symm_mem/nvshmem_extension.cuh
index 634a75ef9903..fd6e0a38492c 100644
--- a/torch/csrc/distributed/c10d/symm_mem/nvshmem_extension.cuh
+++ b/torch/csrc/distributed/c10d/symm_mem/nvshmem_extension.cuh
@@ -14,7 +14,7 @@ void initialize_nvshmem_with_store(
 // Check if NVSHMEM is available
 TORCH_API bool is_nvshmem_available();
 
-// Intializes the device state in CUmodule so that it’s able to perform NVSHMEM
+// Initializes the device state in CUmodule so that it’s able to perform NVSHMEM
 // operations.
 TORCH_API void nvshmemx_cumodule_init(uintptr_t module);
 
diff --git a/torch/csrc/distributed/rpc/agent_utils.h b/torch/csrc/distributed/rpc/agent_utils.h
index 016f6110e13e..8e403bcb6912 100644
--- a/torch/csrc/distributed/rpc/agent_utils.h
+++ b/torch/csrc/distributed/rpc/agent_utils.h
@@ -24,7 +24,7 @@ TORCH_API std::unordered_map<std::string, worker_id_t> collectCurrentNames(
     const worker_id_t selfId,
     const std::string& selfName);
 
-// Remove name frmo Store, used in dynamic RPC groups.
+// Remove name from Store, used in dynamic RPC groups.
 // NOTE: This needs to be called with the Dynamic RPC group
 // membership management token held.
 TORCH_API void removeCurrentName(
diff --git a/torch/csrc/distributed/rpc/py_rref.cpp b/torch/csrc/distributed/rpc/py_rref.cpp
index 8559254de851..f7bc517f41c5 100644
--- a/torch/csrc/distributed/rpc/py_rref.cpp
+++ b/torch/csrc/distributed/rpc/py_rref.cpp
@@ -16,7 +16,7 @@ namespace torch::distributed::rpc {
 namespace {
 
 py::tuple toPyTuple(const RRefForkData& rrefForkData) {
-  // add GIL as it is contructing a py::object
+  // add GIL as it is constructing a py::object
   pybind11::gil_scoped_acquire ag;
   return py::make_tuple(
       rrefForkData.ownerId_,
diff --git a/torch/csrc/distributed/rpc/python_rpc_handler.cpp b/torch/csrc/distributed/rpc/python_rpc_handler.cpp
index 8a2acc82626b..de5cd0540a45 100644
--- a/torch/csrc/distributed/rpc/python_rpc_handler.cpp
+++ b/torch/csrc/distributed/rpc/python_rpc_handler.cpp
@@ -121,7 +121,7 @@ PythonRpcHandler& PythonRpcHandler::getInstance() {
   // initialization by calling `new PythonRpcHandler()`, inside of which GIL is
   // also required. Static data initialization is thread-safe, so the thread
   // holding the GIL will wait for the other thread to finish static data
-  // initializating before going forward. Because the initialization can't
+  // initializing before going forward. Because the initialization can't
   // proceed without GIL, there is a deadlock. We ask the calling thread to
   // release GIL to avoid this situation.
   TORCH_INTERNAL_ASSERT(!PyGILState_Check());
diff --git a/torch/csrc/distributed/rpc/rref_context.cpp b/torch/csrc/distributed/rpc/rref_context.cpp
index 1022d6ff97d7..fa26c1849dde 100644
--- a/torch/csrc/distributed/rpc/rref_context.cpp
+++ b/torch/csrc/distributed/rpc/rref_context.cpp
@@ -348,7 +348,7 @@ c10::intrusive_ptr<OwnerRRef> RRefContext::getOrCreateOwnerRRef(
   // here is a plain TensorType, they are not equal relationship:
   // specialized TensorType <: plain TensorType
   //
-  // In RPC we don't care the difference as we ser/de with just the
+  // In RPC we don't care the difference as we ser'de with just the
   // plain TensorType. This is not a issue for UserRRef creation either,
   // since Tensor can only get specialized with a previous run of local
   // JIT function, and we shouldn't preserve the specialized SubTensorType
diff --git a/torch/csrc/distributed/rpc/rref_context.h b/torch/csrc/distributed/rpc/rref_context.h
index 3282e8c0e108..ce3b71580ab6 100644
--- a/torch/csrc/distributed/rpc/rref_context.h
+++ b/torch/csrc/distributed/rpc/rref_context.h
@@ -318,7 +318,7 @@ class TORCH_API RRefContext {
   // RRef is forwarded to the callee as new UserRRefs (if the callee is not
   // the owner). In this case, we block running the user function until all
   // UserRRefs are confirmed by the owner.
-  // This contract gurantees that no UserRRefs can be used remotely without
+  // This contract guarantees that no UserRRefs can be used remotely without
   // confirmation. Note that, however, the UserRRef created by rpc.remote can
   // still be passed to local functions as arguments and used there. This is by
   // design, because this feature is especially useful when, say a master node
diff --git a/torch/csrc/distributed/rpc/tensorpipe_agent.cpp b/torch/csrc/distributed/rpc/tensorpipe_agent.cpp
index 9801a0327ddf..94fdd3c036ce 100644
--- a/torch/csrc/distributed/rpc/tensorpipe_agent.cpp
+++ b/torch/csrc/distributed/rpc/tensorpipe_agent.cpp
@@ -371,7 +371,7 @@ void TensorPipeAgent::checkAndSetStaticGroup(
       isStaticGroupKey, std::vector<uint8_t>(), isStaticGroupVec);
   std::string returnedVal = std::string(returnedVec.begin(), returnedVec.end());
   // In both cases, the returned value should be the value of isStaticGroupStr,
-  // otherwise there is a discrepency with initialization among one of the
+  // otherwise there is a discrepancy with initialization among one of the
   // members
   TORCH_CHECK(
       returnedVal == isStaticGroupStr,
diff --git a/torch/csrc/distributed/rpc/tensorpipe_agent.h b/torch/csrc/distributed/rpc/tensorpipe_agent.h
index aaa2e9699e4e..adce40568402 100644
--- a/torch/csrc/distributed/rpc/tensorpipe_agent.h
+++ b/torch/csrc/distributed/rpc/tensorpipe_agent.h
@@ -121,8 +121,8 @@ struct TORCH_API TensorPipeRpcBackendOptions : public RpcBackendOptions {
       deviceMaps[workerName] = deviceMap;
     } else {
       for (auto& entry : deviceMap) {
-        // c10::Device has no default constructor, hence map[device] dosn't work
-        // In C++-17 we can use insert_or_assign.
+        // c10::Device has no default constructor, hence map[device] doesn't
+        // work In C++-17 we can use insert_or_assign.
         auto entryIter = iter->second.find(entry.first);
         if (entryIter == iter->second.end()) {
           iter->second.emplace(entry.first, entry.second);