[BE][11/16] fix typos in torch/ (torch/csrc/distributed/) (#156321)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/156321
Approved by: https://github.com/jingsh
ghstack dependencies: #156313, #156314, #156315, #156316, #156317, #156319
Author: Xuehai Pan
Date: 2025-06-22 22:22:34 +08:00
Committed by: PyTorch MergeBot
Parent: 5b210bb3a6
Commit: d55dc00f84
34 changed files with 58 additions and 57 deletions

@@ -1179,7 +1179,6 @@ exclude_patterns = [
'torch/utils/**',
'torch/csrc/jit/**',
'torch/csrc/jit/[a-o]*/**',
'torch/csrc/distributed/**',
]
init_command = [
'python3',

@@ -24,5 +24,6 @@ rebuilt
reenable
reenabled
requestor
ser'de
supercedes
te

@@ -15,7 +15,7 @@ class BackwardPassCleanupGuard;
// This is a singleton class responsible for running distributed backward
// passes. This engine relies heavily on the vanilla autograd engine and tries
// to re-use it as much as possible. This class is mostly responsible for the
// to reuse it as much as possible. This class is mostly responsible for the
// distributed aspects of autograd and tries to hook into the autograd engine
// where convenient.

@@ -45,7 +45,7 @@ RpcWithProfilingReq::RpcWithProfilingReq(
tensors_(std::move(tensors)),
profilerConfig_(std::move(profilerConfig)),
profilingKeyId_(profilingKeyId) {
TORCH_INTERNAL_ASSERT(wrappedRpc_ != nullptr, "wrappedRpc cant be null");
TORCH_INTERNAL_ASSERT(wrappedRpc_ != nullptr, "wrappedRpc can't be null");
}
rpc::MessageType RpcWithProfilingReq::wrappedMessageType() const {

@@ -323,7 +323,7 @@ FileStore::~FileStore() {
auto numFinishedWorker = addHelper(cleanupKey_, 1);
auto refCount = addHelper(refCountKey_, -1);
// The last worker cleans up the file. If numWorkers was not initialized to
// a specific postive value (i.e. meaning that there was not a fixed number
// a specific positive value (i.e. meaning that there was not a fixed number
// of workers), we don't attempt to clean.
// Clean up the file if number of references is 0.
if (refCount == 0 && numWorkers_ >= 0 && numFinishedWorker >= numWorkers_) {

@@ -145,7 +145,7 @@ struct FlightRecorder {
std::optional<c10::time_t> time_discovered_started_;
// timestamp when our CPU threads discovered that the kernel completed.
// will always be _after_ it actually complated, and can be the same time
// will always be _after_ it actually completed, and can be the same time
// as the discovery of the start if the watchdog thread is stuck on CUDA
// APIs
std::optional<c10::time_t> time_discovered_completed_;

@@ -965,7 +965,7 @@ c10::intrusive_ptr<Work> ProcessGroupGloo::allreduce_sparse(
const AllreduceOptions& opts) {
// all reduce sparse calls into default allreduce which
// implemented with all_gathering indices and values
// we do ths we do not have a native cuda implementation
// we do this we do not have a native cuda implementation
return allreduce(inputs, opts);
}

@@ -65,7 +65,7 @@ struct WorkEntry {
// That is, The process may be multi-threaded, and multiple threads may make
// MPI calls, but only one at a time: MPI calls are not made concurrently from
// two distinct threads (all MPI calls are serialized). However, with
// MPI_THREAD_SERIALIZED, ProcessGroupMPI will only support a singe process
// MPI_THREAD_SERIALIZED, ProcessGroupMPI will only support a single process
// group. In other words, no more than 1 process group can be created globally.
//
// If you would like to use multiple ProcessGroupMPI, it requires your MPI

@@ -1423,7 +1423,7 @@ void ProcessGroupNCCL::abortCommsFromMap(
bool ProcessGroupNCCL::abortComms(
const std::optional<std::string>& abortReason) {
// Remove record from global ncclCommMemPoolMapMutex before aboarting,
// so that a new cache segment would not register to already aborded
// so that a new cache segment would not register to already aborted
// communicators. Note that ncclCommMemPoolMap is a global container which may
// contain other PG's communicators, thus we need to only erase communicators
// for the current PG.
@@ -1451,9 +1451,9 @@ void ProcessGroupNCCL::abort() {
terminateProcessGroup_.store(true);
watchdog_->notify();
// lauch abort asynchrounously and wait for it to complete or timeout
// launch abort asynchronously and wait for it to complete or timeout
LOG(INFO) << logPrefix()
<< "Launching ProcessGroupNCCL abort asynchrounously.";
<< "Launching ProcessGroupNCCL abort asynchronously.";
std::future<bool> fut =
std::async(std::launch::async, [this]() { return this->abortComms(); });
@@ -1655,7 +1655,7 @@ std::string ProcessGroupNCCL::HeartbeatMonitor::getNCCLWatchdogTimeoutExitMsg(
void ProcessGroupNCCL::HeartbeatMonitor::setLastWorkListUpdateTime(
std::chrono::time_point<std::chrono::steady_clock> time) {
// We intentially let the race condition to happen but this is ok
// We intentionally let the race condition to happen but this is ok
// as long as we update the time, we know we are making progress.
lastWorkListUpdateTime_ = time;
}
@@ -1761,7 +1761,7 @@ void ProcessGroupNCCL::HeartbeatMonitor::runLoop() {
// 1. The current rank is the first to observe a timeout in watchdog.
// (shouldDump_ was set to true by the watchdog thread).
// 2. Other ranks detected the timeout and signal the current rank to
// dump. In addtion, monitor threads will dump if watchdog threads has no
// dump. In addition, monitor threads will dump if watchdog threads has no
// heartbeat or dumpPipe is not empty.
if (shouldDump_.load()) {
errorMsg = getNCCLWatchdogTimeoutErrorMsg("this local rank");
@@ -3030,7 +3030,7 @@ std::shared_ptr<NCCLComm> ProcessGroupNCCL::initNCCLComm(
bool useScalableInit = false;
// (nranks / nroots) == 128 was the default NCCL recommended
// accoring to
// according to
// https://github.com/pytorch/pytorch/pull/136789#discussion_r1779171615.
auto ranksPerRoot = getCvarInt(TORCH_NCCL_RANKS_PER_ROOT, 128);
#if defined(NCCL_HAS_INIT_RANK_SCALABLE) && defined(NCCL_HAS_CONFIG)
@@ -3327,7 +3327,7 @@ c10::intrusive_ptr<ProcessGroupNCCL::WorkNCCL> ProcessGroupNCCL::initWork(
// - initially, moved record() into workEnqueue(), but found that makes it
// hard to get access to profilingTitle,
// inputs, and outputs for metadata recording, and we don't want to attach
// these objects to the Work becuase it has implications for keeping those
// these objects to the Work because it has implications for keeping those
// tensors alive longer and adds overhead when copying Work objects
// between threads
r->trace_id_ = FlightRecorderCUDA::get()->record(
@@ -3442,7 +3442,7 @@ void ProcessGroupNCCL::startCoalescing() {
// ops from a coalesce group into the flight recorder, we want to have the
// same seq_ for those ops and its 'endCoalescing' op. Hence we bump during
// start, which has one minor downside- we burn a seq_ if someone ever does a
// 'start' and 'end' coalescing region without doing an operation inbetween.
// 'start' and 'end' coalescing region without doing an operation in between.
coalescedDevice_.set_index(-1);
coalescedComm_ = nullptr;
@@ -3462,7 +3462,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::endCoalescing(OpType optype) {
}
TORCH_CHECK(
coalescedDevice_.index() >= 0,
"Somthing went wrong. Did you call end_coalescing before start_coalescing?");
"Something went wrong. Did you call end_coalescing before start_coalescing?");
// `coalescedComm_` should have same set of comms across collectives
auto comm = coalescedComm_;
@@ -3618,7 +3618,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::collective(
device, rank_, opType, false, profilingTitle, inputs, outputs, enqueue);
if (coalescing_state_) {
// When coalescing, we record events per op that lack timing/state
// information becuase there is no 'work' associated with them, and then
// information because there is no 'work' associated with them, and then
// later in endCoalescing we record a 'coalesced' Work which has
// timing/state updates via watchdog thread, but lacks op metadata such as
// input/output sizes and profilingTitle per-op in the group.
@@ -3781,7 +3781,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::collectiveCoalesced(
// collective so there is no flight record and we increment seqCollective_ and
// op_id_ together. Compare this to startCoalescing/endCoalescing flow where
// we increment either seqP2P_ or seqCollective_ once per group and increment
// op_id_ once per indvidual operation within the group
// op_id_ once per individual operation within the group
op_id_++;
const auto key = getKeyFromDevice(device);
@@ -4016,7 +4016,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::pointToPoint(
c10::intrusive_ptr<ProcessGroupNCCL::WorkNCCL> work;
if (coalescing_state_) {
// When coalescing, we record events per op that lack timing/state
// information becuase there is no 'work' associated with them, and then
// information because there is no 'work' associated with them, and then
// later in endCoalescing we record a 'coalesced' Work which has
// timing/state updates via watchdog thread, but lacks op metadata such as
// input/output sizes and profilingTitle per-op in the group.
@@ -4397,7 +4397,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::allreduce_coalesced(
std::make_tuple(
static_cast<int64_t>(seqCollective_) + 1,
false), // seq + 1 to match collective and assume only one collective
// in coalesed range
// in coalesced range
std::make_tuple(pg_uid_, pg_desc_), // PG name tuple
tensors, // inputTensors
tensors, // outputTensors
@@ -4694,7 +4694,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::allgather(
// User-facing outputTensors should be held by the user until after
// waiting on work_, or the call makes no sense. We do a stashing here
// in case user doesn't hold the outputTensors in downstream code,
// which can cause an early recyle by the CachingAllocator, which can
// which can cause an early recycle by the CachingAllocator, which can
// lead to segfault or data corruption.
if (opts.asyncOp) {
work->stashed_for_allocator_safety_->stash(outputTensors_);
@@ -4742,7 +4742,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::allgather_into_tensor_coalesced(
std::make_tuple(
static_cast<int64_t>(seqCollective_) + 1,
false), // seq + 1 to match collective and assume only one collective
// in coalesed range
// in coalesced range
std::make_tuple(pg_uid_, pg_desc_), // PG name tuple
inputs, // inputTensors
outputs, // outputTensors
@@ -4956,7 +4956,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::reduce_scatter_tensor_coalesced(
std::make_tuple(
static_cast<int64_t>(seqCollective_) + 1,
false), // seq + 1 to match collective and assume only one collective
// in coalesed range
// in coalesced range
std::make_tuple(pg_uid_, pg_desc_), // PG name tuple
inputs, // inputTensors
outputs, // outputTensors

@@ -1291,7 +1291,7 @@ class TORCH_API ProcessGroupNCCL : public Backend {
// communication, the key will be "1:2" on both processes. Note: this is for
// the scenario where there is only 1 GPU per process. When it comes to
// multiple GPUs per process, this part may need to redesigned.
// TODO: we probably need a separte map for P2P comms
// TODO: we probably need a separate map for P2P comms
std::unordered_map<std::string, std::shared_ptr<NCCLComm>> devNCCLCommMap_;
// The NCCL communicators currently in process of being initialized.
@@ -1316,7 +1316,7 @@ class TORCH_API ProcessGroupNCCL : public Backend {
std::atomic<bool> hasPendingHooks_{};
// This is the signal from watchdog threads to indicate whether the monitor
// thread should dump. Making it static so that it is accessiable from all the
// thread should dump. Making it static so that it is accessible from all the
// PGs. With this flag, monitor thread would dump debug info under any one of
// the three conditions:
//

@@ -397,7 +397,7 @@ class WriterPayload : public c10::intrusive_ptr_target {
void registeredInLoop() {
/*
This refcount increment must be matched by a reclaim call.
Call this method after sucessfully scheduling this handle with a loop.
Call this method after successfully scheduling this handle with a loop.
*/
at::raw::intrusive_ptr::incref(this);
}

@@ -573,9 +573,9 @@ using SizeType = uint64_t;
// (https://stackoverflow.com/a/20295079), and thus `errno` should really only
// be inspected if an error occurred.
//
// `success_cond` is an expression used to check if an error has happend. So for
// `fork()`, we can use `SYSCHECK(pid = fork(), pid != -1)`. The function output
// is stored in variable `__output` and may be used in `success_cond`.
// `success_cond` is an expression used to check if an error has happened. So
// for `fork()`, we can use `SYSCHECK(pid = fork(), pid != -1)`. The function
// output is stored in variable `__output` and may be used in `success_cond`.
#ifdef _WIN32
#define SYSCHECK(expr, success_cond) \
while (true) { \

@@ -118,7 +118,7 @@ class TORCH_API Work : public torch::CustomClassHolder {
// Get a Future object that would be marked as either success or failure
// This API can be used by the user to track the completion of the work
// and hanlde the exception if any.
// and handle the exception if any.
virtual c10::intrusive_ptr<c10::ivalue::Future> getFutureResult();
virtual float getDuration() const;

@@ -67,7 +67,8 @@ class TORCH_API GradBucket {
return parameters_;
}
// Returns whther this bucket is the last bucket to allreduce in an iteration.
// Returns whether this bucket is the last bucket to allreduce in an
// iteration.
bool isLast() const {
return index_ == bucket_count_ - 1;
}

@@ -5,7 +5,7 @@
#include <ATen/cuda/nvrtc_stub/ATenNVRTC.h>
#include <c10/cuda/CUDAGuard.h>
// Two warninngs in Cutlass included header files
// Two warnings in Cutlass included header files
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wset-but-not-used")
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wunused-but-set-parameter")
@@ -163,7 +163,7 @@ at::Tensor async_input_mm_impl(
TORCH_CHECK(
M % num_chunks_M == 0,
"async_input_mm: `a.shape(0)` must be an interger multiple of `a_chunk_signals.numel()`");
"async_input_mm: `a.shape(0)` must be an integer multiple of `a_chunk_signals.numel()`");
size_t chunk_size_M = M / num_chunks_M;
size_t tile_size_M = cute::get<0>(TileShape_MNK{});
@@ -248,7 +248,7 @@ at::Tensor async_input_mm_out(
});
#else
TORCH_CHECK(
false, "async_input_mm is not currenlty supported on your device");
false, "async_input_mm is not currently supported on your device");
#endif
return out;
}

@@ -3,7 +3,7 @@
* that supports consuming asynchronous input. This tile scheduler introduces the following arguments:
*
* - tiles_per_chunk_m Specifies the size of an M chunk. Chunks are the granularity at which the
* asynchronous input becomes ready. It must be an interger multiple of the size of an M tile.
* asynchronous input becomes ready. It must be an integer multiple of the size of an M tile.
*
* - chunk_signals chunk_signals[i] == 1 indicates that chunk i is ready. Before returning a work
* tile, get_current_work() waits for the signal to ensure that the corresponding chunk is ready.
@@ -327,7 +327,7 @@ public:
wait_signal(scheduler_params.chunk_signals + chunk_idx);
}
// An arbirary, non-default id
// An arbitrary, non-default id
constexpr int barrier_id = 8;
arch::NamedBarrier barrier(NumThreadsPerWarp, barrier_id);
barrier.arrive_and_wait();

@@ -1006,7 +1006,7 @@ This class does not support ``__members__`` property.)");
});
#ifdef USE_NVSHMEM
// Intializes the device state in CUmodule so that its able to perform
// Initializes the device state in CUmodule so that its able to perform
// NVSHMEM operations.
module.def(
"_nvshmemx_cumodule_init",
@@ -3297,7 +3297,7 @@ Arguments:
Default is False.
Attributes:
config (NCCLConfig): configures NCCL communicators (only avaiable for
config (NCCLConfig): configures NCCL communicators (only available for
builds using NCCL 2.17+). This can be used to improve
communication-computation overlap for NCCL kernels by tuning
available parameters in the config. See

@@ -10,7 +10,7 @@ namespace c10d {
// A struct to hold the latest status of the process group.
struct ProcessGroupStatus {
// the sequential number of the last collective enqueued into workMetaList_
// This is useful for indentifying a rank that has not join a collective
// This is useful for identifying a rank that has not join a collective
// initialized to be -1 to indicate no collective has been enqueued
int64_t lastEnqueuedSeq{-1};
// the sequential number of the last collective started as the kernel

@@ -15,7 +15,7 @@ class TORCH_PYTHON_API PythonCommHook : public CommHookInterface {
// The state is passed to the hook in runHook method, and it can be used to
// maintain and update any state information during the execution of the hook.
// The hook performs user-specified processing and returns a future indicating
// asychronous communication of gradients.
// asynchronous communication of gradients.
PythonCommHook(py::object state, py::object hook)
: state_(std::move(state)), hook_(std::move(hook)) {}

@@ -1245,7 +1245,7 @@ void Reducer::initialize_buckets(
// patterns when copy_ing grad data in and out of its bucket view.
// However, numerics remain correct, because the bucket view is the same
// on either end of the raw allreduce. bucket_view_in.copy(grad)
// tranposes
// transposes
// (+ densifies) to the bucket view's layout, the data is allreduced,
// then grad.copy_(bucket_view_out) transposes it back to grad's layout.
//

@@ -564,7 +564,7 @@ class TORCH_API Reducer {
// Retrieves parameter corresponding to the given VariableIndex.
at::Tensor& get_param_from_index(size_t index);
// Python reducer keeps C++ reducer initialized. To remove this flag,
// we need to refactor the DDP wrapper's initilization.
// we need to refactor the DDP wrapper's initialization.
bool use_python_reducer_;
// Cached bucket index to model parameter mapping. Populated after buckets

@@ -50,7 +50,7 @@ __device__ __forceinline__ void trap() {
#if defined(USE_ROCM)
// abort() calls trap() under the covers. However, on ROCm, the trap is
// handled differently inside hip runtime. It collects a gpu core dump and
// causes linux kernerl to create a core dump of the host application.
// causes linux kernel to create a core dump of the host application.
abort();
#else
__trap();

@@ -56,7 +56,7 @@
INT_SWITCH_CASE(k_alignment, 8, __VA_ARGS__); \
INT_SWITCH_CASE(k_alignment, 4, __VA_ARGS__); \
default: { \
TORCH_CHECK(false, "Not implemented for aligment=", alignment); \
TORCH_CHECK(false, "Not implemented for alignment=", alignment); \
} \
}

@@ -156,7 +156,7 @@ int IpcChannel::recv_fd() {
.msg_control = cbuf,
.msg_controllen = sizeof(cbuf)};
// Recieve message on socket_
// Receive message on socket_
TORCH_CHECK(
recvmsg(socket_, &msg, 0) > 0,
"Failed to receive fd: ",

@@ -62,7 +62,7 @@ static NvlMesh getNvlMesh(const std::vector<int>& rankToDeviceIdx) {
}
/**
* Detech topology given a NvlMesh.
* Detect topology given a NvlMesh.
*/
static Topology detectTopology(const NvlMesh nvlMesh, size_t worldSize) {
if (getCvarBool(TEST_INTRA_NODE_COMM, false)) {

@@ -101,7 +101,7 @@ void initialize_nvshmem_with_store(
LOG(INFO) << "NVSHMEM is available, version: " << major << "." << minor;
}
// Intializes the device state in CUmodule so that its able to perform NVSHMEM
// Initializes the device state in CUmodule so that its able to perform NVSHMEM
// operations.
void nvshmemx_cumodule_init(uintptr_t module) {
auto cumodule = reinterpret_cast<CUmodule>(module);
@@ -546,7 +546,7 @@ at::Tensor nvshmem_all_to_all_vdev_2d(
| c0 | d0 | c1 | d1 | c2 | d2 | c3 | d3 |
where each `c_i` / `d_i` are slices of the `input` tensor, targeting
expert `i`, with length indicated by input splits (in
`in_out_splits[0]`). That is, the 2D AllToAllv shuffle achives a
`in_out_splits[0]`). That is, the 2D AllToAllv shuffle achieves a
transpose from rank-major order at input to expert-major order at
output.

@@ -14,7 +14,7 @@ void initialize_nvshmem_with_store(
// Check if NVSHMEM is available
TORCH_API bool is_nvshmem_available();
// Intializes the device state in CUmodule so that its able to perform NVSHMEM
// Initializes the device state in CUmodule so that its able to perform NVSHMEM
// operations.
TORCH_API void nvshmemx_cumodule_init(uintptr_t module);

@@ -24,7 +24,7 @@ TORCH_API std::unordered_map<std::string, worker_id_t> collectCurrentNames(
const worker_id_t selfId,
const std::string& selfName);
// Remove name frmo Store, used in dynamic RPC groups.
// Remove name from Store, used in dynamic RPC groups.
// NOTE: This needs to be called with the Dynamic RPC group
// membership management token held.
TORCH_API void removeCurrentName(

@@ -16,7 +16,7 @@ namespace torch::distributed::rpc {
namespace {
py::tuple toPyTuple(const RRefForkData& rrefForkData) {
// add GIL as it is contructing a py::object
// add GIL as it is constructing a py::object
pybind11::gil_scoped_acquire ag;
return py::make_tuple(
rrefForkData.ownerId_,

@@ -121,7 +121,7 @@ PythonRpcHandler& PythonRpcHandler::getInstance() {
// initialization by calling `new PythonRpcHandler()`, inside of which GIL is
// also required. Static data initialization is thread-safe, so the thread
// holding the GIL will wait for the other thread to finish static data
// initializating before going forward. Because the initialization can't
// initializing before going forward. Because the initialization can't
// proceed without GIL, there is a deadlock. We ask the calling thread to
// release GIL to avoid this situation.
TORCH_INTERNAL_ASSERT(!PyGILState_Check());

@@ -348,7 +348,7 @@ c10::intrusive_ptr<OwnerRRef> RRefContext::getOrCreateOwnerRRef(
// here is a plain TensorType, they are not equal relationship:
// specialized TensorType <: plain TensorType
//
// In RPC we don't care the difference as we ser/de with just the
// In RPC we don't care the difference as we ser'de with just the
// plain TensorType. This is not a issue for UserRRef creation either,
// since Tensor can only get specialized with a previous run of local
// JIT function, and we shouldn't preserve the specialized SubTensorType

@@ -318,7 +318,7 @@ class TORCH_API RRefContext {
// RRef is forwarded to the callee as new UserRRefs (if the callee is not
// the owner). In this case, we block running the user function until all
// UserRRefs are confirmed by the owner.
// This contract gurantees that no UserRRefs can be used remotely without
// This contract guarantees that no UserRRefs can be used remotely without
// confirmation. Note that, however, the UserRRef created by rpc.remote can
// still be passed to local functions as arguments and used there. This is by
// design, because this feature is especially useful when, say a master node

@@ -371,7 +371,7 @@ void TensorPipeAgent::checkAndSetStaticGroup(
isStaticGroupKey, std::vector<uint8_t>(), isStaticGroupVec);
std::string returnedVal = std::string(returnedVec.begin(), returnedVec.end());
// In both cases, the returned value should be the value of isStaticGroupStr,
// otherwise there is a discrepency with initialization among one of the
// otherwise there is a discrepancy with initialization among one of the
// members
TORCH_CHECK(
returnedVal == isStaticGroupStr,

@@ -121,8 +121,8 @@ struct TORCH_API TensorPipeRpcBackendOptions : public RpcBackendOptions {
deviceMaps[workerName] = deviceMap;
} else {
for (auto& entry : deviceMap) {
// c10::Device has no default constructor, hence map[device] dosn't work
// In C++-17 we can use insert_or_assign.
// c10::Device has no default constructor, hence map[device] doesn't
// work In C++-17 we can use insert_or_assign.
auto entryIter = iter->second.find(entry.first);
if (entryIter == iter->second.end()) {
iter->second.emplace(entry.first, entry.second);