Mirror of https://github.com/pytorch/pytorch.git, synced 2025-10-20 21:14:14 +08:00
[BE][11/16] fix typos in torch/ (torch/csrc/distributed/) (#156321)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/156321
Approved by: https://github.com/jingsh
ghstack dependencies: #156313, #156314, #156315, #156316, #156317, #156319

Committed by: PyTorch MergeBot
Parent: 5b210bb3a6
Commit: d55dc00f84
@@ -1179,7 +1179,6 @@ exclude_patterns = [
'torch/utils/**',
'torch/csrc/jit/**',
'torch/csrc/jit/[a-o]*/**',
-'torch/csrc/distributed/**',
]
init_command = [
'python3',
@@ -24,5 +24,6 @@ rebuilt
reenable
reenabled
requestor
+ser'de
supercedes
te
@@ -15,7 +15,7 @@ class BackwardPassCleanupGuard;

// This is a singleton class responsible for running distributed backward
// passes. This engine relies heavily on the vanilla autograd engine and tries
-// to re-use it as much as possible. This class is mostly responsible for the
+// to reuse it as much as possible. This class is mostly responsible for the
// distributed aspects of autograd and tries to hook into the autograd engine
// where convenient.

@@ -45,7 +45,7 @@ RpcWithProfilingReq::RpcWithProfilingReq(
tensors_(std::move(tensors)),
profilerConfig_(std::move(profilerConfig)),
profilingKeyId_(profilingKeyId) {
-TORCH_INTERNAL_ASSERT(wrappedRpc_ != nullptr, "wrappedRpc cant be null");
+TORCH_INTERNAL_ASSERT(wrappedRpc_ != nullptr, "wrappedRpc can't be null");
}

rpc::MessageType RpcWithProfilingReq::wrappedMessageType() const {
@@ -323,7 +323,7 @@ FileStore::~FileStore() {
auto numFinishedWorker = addHelper(cleanupKey_, 1);
auto refCount = addHelper(refCountKey_, -1);
// The last worker cleans up the file. If numWorkers was not initialized to
-// a specific postive value (i.e. meaning that there was not a fixed number
+// a specific positive value (i.e. meaning that there was not a fixed number
// of workers), we don't attempt to clean.
// Clean up the file if number of references is 0.
if (refCount == 0 && numWorkers_ >= 0 && numFinishedWorker >= numWorkers_) {
@@ -145,7 +145,7 @@ struct FlightRecorder {
std::optional<c10::time_t> time_discovered_started_;

// timestamp when our CPU threads discovered that the kernel completed.
-// will always be _after_ it actually complated, and can be the same time
+// will always be _after_ it actually completed, and can be the same time
// as the discovery of the start if the watchdog thread is stuck on CUDA
// APIs
std::optional<c10::time_t> time_discovered_completed_;
@@ -965,7 +965,7 @@ c10::intrusive_ptr<Work> ProcessGroupGloo::allreduce_sparse(
const AllreduceOptions& opts) {
// all reduce sparse calls into default allreduce which
// implemented with all_gathering indices and values
-// we do ths we do not have a native cuda implementation
+// we do this we do not have a native cuda implementation
return allreduce(inputs, opts);
}

@@ -65,7 +65,7 @@ struct WorkEntry {
// That is, The process may be multi-threaded, and multiple threads may make
// MPI calls, but only one at a time: MPI calls are not made concurrently from
// two distinct threads (all MPI calls are serialized). However, with
-// MPI_THREAD_SERIALIZED, ProcessGroupMPI will only support a singe process
+// MPI_THREAD_SERIALIZED, ProcessGroupMPI will only support a single process
// group. In other words, no more than 1 process group can be created globally.
//
// If you would like to use multiple ProcessGroupMPI, it requires your MPI
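
For context on the MPI_THREAD_SERIALIZED level referenced in the comment above: the thread level is negotiated once, when MPI is initialized. Below is a minimal standalone C++ sketch (generic MPI usage, not PyTorch code) that requests this level and checks what the library actually provides.

    #include <mpi.h>
    #include <cstdio>

    int main(int argc, char** argv) {
      int provided = 0;
      // Request MPI_THREAD_SERIALIZED: any thread may make MPI calls, but the
      // application must ensure no two MPI calls are made concurrently.
      MPI_Init_thread(&argc, &argv, MPI_THREAD_SERIALIZED, &provided);
      if (provided < MPI_THREAD_SERIALIZED) {
        std::fprintf(stderr, "MPI only provides thread level %d\n", provided);
      }
      int rank = 0;
      MPI_Comm_rank(MPI_COMM_WORLD, &rank);
      std::printf("rank %d initialized with thread level %d\n", rank, provided);
      MPI_Finalize();
      return 0;
    }

The restriction described in the comment follows from this level: MPI calls may come from multiple threads, but never concurrently, which is why no more than one process group can be created globally.
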
@@ -1423,7 +1423,7 @@ void ProcessGroupNCCL::abortCommsFromMap(
bool ProcessGroupNCCL::abortComms(
const std::optional<std::string>& abortReason) {
// Remove record from global ncclCommMemPoolMapMutex before aboarting,
-// so that a new cache segment would not register to already aborded
+// so that a new cache segment would not register to already aborted
// communicators. Note that ncclCommMemPoolMap is a global container which may
// contain other PG's communicators, thus we need to only erase communicators
// for the current PG.
@@ -1451,9 +1451,9 @@ void ProcessGroupNCCL::abort() {
terminateProcessGroup_.store(true);
watchdog_->notify();

-// lauch abort asynchrounously and wait for it to complete or timeout
+// launch abort asynchronously and wait for it to complete or timeout
LOG(INFO) << logPrefix()
-<< "Launching ProcessGroupNCCL abort asynchrounously.";
+<< "Launching ProcessGroupNCCL abort asynchronously.";
std::future<bool> fut =
std::async(std::launch::async, [this]() { return this->abortComms(); });

@@ -1655,7 +1655,7 @@ std::string ProcessGroupNCCL::HeartbeatMonitor::getNCCLWatchdogTimeoutExitMsg(

void ProcessGroupNCCL::HeartbeatMonitor::setLastWorkListUpdateTime(
std::chrono::time_point<std::chrono::steady_clock> time) {
-// We intentially let the race condition to happen but this is ok
+// We intentionally let the race condition to happen but this is ok
// as long as we update the time, we know we are making progress.
lastWorkListUpdateTime_ = time;
}
@@ -1761,7 +1761,7 @@ void ProcessGroupNCCL::HeartbeatMonitor::runLoop() {
// 1. The current rank is the first to observe a timeout in watchdog.
// (shouldDump_ was set to true by the watchdog thread).
// 2. Other ranks detected the timeout and signal the current rank to
-// dump. In addtion, monitor threads will dump if watchdog threads has no
+// dump. In addition, monitor threads will dump if watchdog threads has no
// heartbeat or dumpPipe is not empty.
if (shouldDump_.load()) {
errorMsg = getNCCLWatchdogTimeoutErrorMsg("this local rank");
@@ -3030,7 +3030,7 @@ std::shared_ptr<NCCLComm> ProcessGroupNCCL::initNCCLComm(

bool useScalableInit = false;
// (nranks / nroots) == 128 was the default NCCL recommended
-// accoring to
+// according to
// https://github.com/pytorch/pytorch/pull/136789#discussion_r1779171615.
auto ranksPerRoot = getCvarInt(TORCH_NCCL_RANKS_PER_ROOT, 128);
#if defined(NCCL_HAS_INIT_RANK_SCALABLE) && defined(NCCL_HAS_CONFIG)
@@ -3327,7 +3327,7 @@ c10::intrusive_ptr<ProcessGroupNCCL::WorkNCCL> ProcessGroupNCCL::initWork(
// - initially, moved record() into workEnqueue(), but found that makes it
// hard to get access to profilingTitle,
// inputs, and outputs for metadata recording, and we don't want to attach
-// these objects to the Work becuase it has implications for keeping those
+// these objects to the Work because it has implications for keeping those
// tensors alive longer and adds overhead when copying Work objects
// between threads
r->trace_id_ = FlightRecorderCUDA::get()->record(
@@ -3442,7 +3442,7 @@ void ProcessGroupNCCL::startCoalescing() {
// ops from a coalesce group into the flight recorder, we want to have the
// same seq_ for those ops and its 'endCoalescing' op. Hence we bump during
// start, which has one minor downside- we burn a seq_ if someone ever does a
-// 'start' and 'end' coalescing region without doing an operation inbetween.
+// 'start' and 'end' coalescing region without doing an operation in between.

coalescedDevice_.set_index(-1);
coalescedComm_ = nullptr;
@@ -3462,7 +3462,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::endCoalescing(OpType optype) {
}
TORCH_CHECK(
coalescedDevice_.index() >= 0,
-"Somthing went wrong. Did you call end_coalescing before start_coalescing?");
+"Something went wrong. Did you call end_coalescing before start_coalescing?");

// `coalescedComm_` should have same set of comms across collectives
auto comm = coalescedComm_;
@@ -3618,7 +3618,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::collective(
device, rank_, opType, false, profilingTitle, inputs, outputs, enqueue);
if (coalescing_state_) {
// When coalescing, we record events per op that lack timing/state
-// information becuase there is no 'work' associated with them, and then
+// information because there is no 'work' associated with them, and then
// later in endCoalescing we record a 'coalesced' Work which has
// timing/state updates via watchdog thread, but lacks op metadata such as
// input/output sizes and profilingTitle per-op in the group.
@@ -3781,7 +3781,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::collectiveCoalesced(
// collective so there is no flight record and we increment seqCollective_ and
// op_id_ together. Compare this to startCoalescing/endCoalescing flow where
// we increment either seqP2P_ or seqCollective_ once per group and increment
-// op_id_ once per indvidual operation within the group
+// op_id_ once per individual operation within the group
op_id_++;

const auto key = getKeyFromDevice(device);
@@ -4016,7 +4016,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::pointToPoint(
c10::intrusive_ptr<ProcessGroupNCCL::WorkNCCL> work;
if (coalescing_state_) {
// When coalescing, we record events per op that lack timing/state
-// information becuase there is no 'work' associated with them, and then
+// information because there is no 'work' associated with them, and then
// later in endCoalescing we record a 'coalesced' Work which has
// timing/state updates via watchdog thread, but lacks op metadata such as
// input/output sizes and profilingTitle per-op in the group.
@@ -4397,7 +4397,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::allreduce_coalesced(
std::make_tuple(
static_cast<int64_t>(seqCollective_) + 1,
false), // seq + 1 to match collective and assume only one collective
-// in coalesed range
+// in coalesced range
std::make_tuple(pg_uid_, pg_desc_), // PG name tuple
tensors, // inputTensors
tensors, // outputTensors
@@ -4694,7 +4694,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::allgather(
// User-facing outputTensors should be held by the user until after
// waiting on work_, or the call makes no sense. We do a stashing here
// in case user doesn't hold the outputTensors in downstream code,
-// which can cause an early recyle by the CachingAllocator, which can
+// which can cause an early recycle by the CachingAllocator, which can
// lead to segfault or data corruption.
if (opts.asyncOp) {
work->stashed_for_allocator_safety_->stash(outputTensors_);
@@ -4742,7 +4742,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::allgather_into_tensor_coalesced(
std::make_tuple(
static_cast<int64_t>(seqCollective_) + 1,
false), // seq + 1 to match collective and assume only one collective
-// in coalesed range
+// in coalesced range
std::make_tuple(pg_uid_, pg_desc_), // PG name tuple
inputs, // inputTensors
outputs, // outputTensors
@@ -4956,7 +4956,7 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::reduce_scatter_tensor_coalesced(
std::make_tuple(
static_cast<int64_t>(seqCollective_) + 1,
false), // seq + 1 to match collective and assume only one collective
-// in coalesed range
+// in coalesced range
std::make_tuple(pg_uid_, pg_desc_), // PG name tuple
inputs, // inputTensors
outputs, // outputTensors
@@ -1291,7 +1291,7 @@ class TORCH_API ProcessGroupNCCL : public Backend {
// communication, the key will be "1:2" on both processes. Note: this is for
// the scenario where there is only 1 GPU per process. When it comes to
// multiple GPUs per process, this part may need to redesigned.
-// TODO: we probably need a separte map for P2P comms
+// TODO: we probably need a separate map for P2P comms
std::unordered_map<std::string, std::shared_ptr<NCCLComm>> devNCCLCommMap_;

// The NCCL communicators currently in process of being initialized.
@@ -1316,7 +1316,7 @@ class TORCH_API ProcessGroupNCCL : public Backend {
std::atomic<bool> hasPendingHooks_{};

// This is the signal from watchdog threads to indicate whether the monitor
-// thread should dump. Making it static so that it is accessiable from all the
+// thread should dump. Making it static so that it is accessible from all the
// PGs. With this flag, monitor thread would dump debug info under any one of
// the three conditions:
//
@@ -397,7 +397,7 @@ class WriterPayload : public c10::intrusive_ptr_target {
void registeredInLoop() {
/*
This refcount increment must be matched by a reclaim call.
-Call this method after sucessfully scheduling this handle with a loop.
+Call this method after successfully scheduling this handle with a loop.
*/
at::raw::intrusive_ptr::incref(this);
}
@@ -573,9 +573,9 @@ using SizeType = uint64_t;
// (https://stackoverflow.com/a/20295079), and thus `errno` should really only
// be inspected if an error occurred.
//
-// `success_cond` is an expression used to check if an error has happend. So for
-// `fork()`, we can use `SYSCHECK(pid = fork(), pid != -1)`. The function output
-// is stored in variable `__output` and may be used in `success_cond`.
+// `success_cond` is an expression used to check if an error has happened. So
+// for `fork()`, we can use `SYSCHECK(pid = fork(), pid != -1)`. The function
+// output is stored in variable `__output` and may be used in `success_cond`.
#ifdef _WIN32
#define SYSCHECK(expr, success_cond) \
while (true) { \
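
For context on the SYSCHECK comment above: the usual shape of such a wrapper is a retry-on-EINTR loop around the system call that surfaces any other failure. Below is a minimal standalone C++ sketch with a hypothetical readChecked helper; it illustrates the pattern the comment describes, not the actual macro expansion.

    #include <cerrno>
    #include <system_error>
    #include <unistd.h>

    // Hypothetical helper: retry the call while it fails with EINTR, throw on
    // any other failure, and return the call's result so the caller can use it
    // (playing the role of `__output` in the macro's description).
    ssize_t readChecked(int fd, void* buf, size_t count) {
      while (true) {
        ssize_t n = read(fd, buf, count); // the wrapped expression
        if (n != -1) {                    // the success condition
          return n;
        }
        if (errno != EINTR) {             // a real error: report it
          throw std::system_error(errno, std::generic_category(), "read");
        }
        // EINTR: the call was interrupted by a signal, retry it
      }
    }

A macro version generalizes this by taking the expression and its success condition as arguments, as in the `SYSCHECK(pid = fork(), pid != -1)` example quoted above.
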
@@ -118,7 +118,7 @@ class TORCH_API Work : public torch::CustomClassHolder {

// Get a Future object that would be marked as either success or failure
// This API can be used by the user to track the completion of the work
-// and hanlde the exception if any.
+// and handle the exception if any.
virtual c10::intrusive_ptr<c10::ivalue::Future> getFutureResult();

virtual float getDuration() const;
@@ -67,7 +67,8 @@ class TORCH_API GradBucket {
return parameters_;
}

-// Returns whther this bucket is the last bucket to allreduce in an iteration.
+// Returns whether this bucket is the last bucket to allreduce in an
+// iteration.
bool isLast() const {
return index_ == bucket_count_ - 1;
}
@@ -5,7 +5,7 @@
#include <ATen/cuda/nvrtc_stub/ATenNVRTC.h>
#include <c10/cuda/CUDAGuard.h>

-// Two warninngs in Cutlass included header files
+// Two warnings in Cutlass included header files
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wset-but-not-used")
C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wunused-but-set-parameter")

@@ -163,7 +163,7 @@ at::Tensor async_input_mm_impl(

TORCH_CHECK(
M % num_chunks_M == 0,
-"async_input_mm: `a.shape(0)` must be an interger multiple of `a_chunk_signals.numel()`");
+"async_input_mm: `a.shape(0)` must be an integer multiple of `a_chunk_signals.numel()`");
size_t chunk_size_M = M / num_chunks_M;
size_t tile_size_M = cute::get<0>(TileShape_MNK{});

@@ -248,7 +248,7 @@ at::Tensor async_input_mm_out(
});
#else
TORCH_CHECK(
-false, "async_input_mm is not currenlty supported on your device");
+false, "async_input_mm is not currently supported on your device");
#endif
return out;
}
@@ -3,7 +3,7 @@
* that supports consuming asynchronous input. This tile scheduler introduces the following arguments:
*
* - tiles_per_chunk_m – Specifies the size of an M chunk. Chunks are the granularity at which the
-* asynchronous input becomes ready. It must be an interger multiple of the size of an M tile.
+* asynchronous input becomes ready. It must be an integer multiple of the size of an M tile.
*
* - chunk_signals – chunk_signals[i] == 1 indicates that chunk i is ready. Before returning a work
* tile, get_current_work() waits for the signal to ensure that the corresponding chunk is ready.
@@ -327,7 +327,7 @@ public:
wait_signal(scheduler_params.chunk_signals + chunk_idx);
}

-// An arbirary, non-default id
+// An arbitrary, non-default id
constexpr int barrier_id = 8;
arch::NamedBarrier barrier(NumThreadsPerWarp, barrier_id);
barrier.arrive_and_wait();
@@ -1006,7 +1006,7 @@ This class does not support ``__members__`` property.)");
});

#ifdef USE_NVSHMEM
-// Intializes the device state in CUmodule so that it’s able to perform
+// Initializes the device state in CUmodule so that it’s able to perform
// NVSHMEM operations.
module.def(
"_nvshmemx_cumodule_init",
@@ -3297,7 +3297,7 @@ Arguments:
Default is False.

Attributes:
-config (NCCLConfig): configures NCCL communicators (only avaiable for
+config (NCCLConfig): configures NCCL communicators (only available for
builds using NCCL 2.17+). This can be used to improve
communication-computation overlap for NCCL kernels by tuning
available parameters in the config. See
@@ -10,7 +10,7 @@ namespace c10d {
// A struct to hold the latest status of the process group.
struct ProcessGroupStatus {
// the sequential number of the last collective enqueued into workMetaList_
-// This is useful for indentifying a rank that has not join a collective
+// This is useful for identifying a rank that has not join a collective
// initialized to be -1 to indicate no collective has been enqueued
int64_t lastEnqueuedSeq{-1};
// the sequential number of the last collective started as the kernel
@@ -15,7 +15,7 @@ class TORCH_PYTHON_API PythonCommHook : public CommHookInterface {
// The state is passed to the hook in runHook method, and it can be used to
// maintain and update any state information during the execution of the hook.
// The hook performs user-specified processing and returns a future indicating
-// asychronous communication of gradients.
+// asynchronous communication of gradients.
PythonCommHook(py::object state, py::object hook)
: state_(std::move(state)), hook_(std::move(hook)) {}

@@ -1245,7 +1245,7 @@ void Reducer::initialize_buckets(
// patterns when copy_ing grad data in and out of its bucket view.
// However, numerics remain correct, because the bucket view is the same
// on either end of the raw allreduce. bucket_view_in.copy(grad)
-// tranposes
+// transposes
// (+ densifies) to the bucket view's layout, the data is allreduced,
// then grad.copy_(bucket_view_out) transposes it back to grad's layout.
//
@@ -564,7 +564,7 @@ class TORCH_API Reducer {
// Retrieves parameter corresponding to the given VariableIndex.
at::Tensor& get_param_from_index(size_t index);
// Python reducer keeps C++ reducer initialized. To remove this flag,
-// we need to refactor the DDP wrapper's initilization.
+// we need to refactor the DDP wrapper's initialization.
bool use_python_reducer_;

// Cached bucket index to model parameter mapping. Populated after buckets
@@ -50,7 +50,7 @@ __device__ __forceinline__ void trap() {
#if defined(USE_ROCM)
// abort() calls trap() under the covers. However, on ROCm, the trap is
// handled differently inside hip runtime. It collects a gpu core dump and
-// causes linux kernerl to create a core dump of the host application.
+// causes linux kernel to create a core dump of the host application.
abort();
#else
__trap();
@@ -56,7 +56,7 @@
INT_SWITCH_CASE(k_alignment, 8, __VA_ARGS__); \
INT_SWITCH_CASE(k_alignment, 4, __VA_ARGS__); \
default: { \
-TORCH_CHECK(false, "Not implemented for aligment=", alignment); \
+TORCH_CHECK(false, "Not implemented for alignment=", alignment); \
} \
}

@@ -156,7 +156,7 @@ int IpcChannel::recv_fd() {
.msg_control = cbuf,
.msg_controllen = sizeof(cbuf)};

-// Recieve message on socket_
+// Receive message on socket_
TORCH_CHECK(
recvmsg(socket_, &msg, 0) > 0,
"Failed to receive fd: ",
@@ -62,7 +62,7 @@ static NvlMesh getNvlMesh(const std::vector<int>& rankToDeviceIdx) {
}

/**
-* Detech topology given a NvlMesh.
+* Detect topology given a NvlMesh.
*/
static Topology detectTopology(const NvlMesh nvlMesh, size_t worldSize) {
if (getCvarBool(TEST_INTRA_NODE_COMM, false)) {
@@ -101,7 +101,7 @@ void initialize_nvshmem_with_store(
LOG(INFO) << "NVSHMEM is available, version: " << major << "." << minor;
}

-// Intializes the device state in CUmodule so that it’s able to perform NVSHMEM
+// Initializes the device state in CUmodule so that it’s able to perform NVSHMEM
// operations.
void nvshmemx_cumodule_init(uintptr_t module) {
auto cumodule = reinterpret_cast<CUmodule>(module);
@@ -546,7 +546,7 @@ at::Tensor nvshmem_all_to_all_vdev_2d(
| c0 | d0 | c1 | d1 | c2 | d2 | c3 | d3 |
where each `c_i` / `d_i` are slices of the `input` tensor, targeting
expert `i`, with length indicated by input splits (in
-`in_out_splits[0]`). That is, the 2D AllToAllv shuffle achives a
+`in_out_splits[0]`). That is, the 2D AllToAllv shuffle achieves a
transpose from rank-major order at input to expert-major order at
output.

@@ -14,7 +14,7 @@ void initialize_nvshmem_with_store(
// Check if NVSHMEM is available
TORCH_API bool is_nvshmem_available();

-// Intializes the device state in CUmodule so that it’s able to perform NVSHMEM
+// Initializes the device state in CUmodule so that it’s able to perform NVSHMEM
// operations.
TORCH_API void nvshmemx_cumodule_init(uintptr_t module);

@@ -24,7 +24,7 @@ TORCH_API std::unordered_map<std::string, worker_id_t> collectCurrentNames(
const worker_id_t selfId,
const std::string& selfName);

-// Remove name frmo Store, used in dynamic RPC groups.
+// Remove name from Store, used in dynamic RPC groups.
// NOTE: This needs to be called with the Dynamic RPC group
// membership management token held.
TORCH_API void removeCurrentName(
@@ -16,7 +16,7 @@ namespace torch::distributed::rpc {
namespace {

py::tuple toPyTuple(const RRefForkData& rrefForkData) {
-// add GIL as it is contructing a py::object
+// add GIL as it is constructing a py::object
pybind11::gil_scoped_acquire ag;
return py::make_tuple(
rrefForkData.ownerId_,
@@ -121,7 +121,7 @@ PythonRpcHandler& PythonRpcHandler::getInstance() {
// initialization by calling `new PythonRpcHandler()`, inside of which GIL is
// also required. Static data initialization is thread-safe, so the thread
// holding the GIL will wait for the other thread to finish static data
-// initializating before going forward. Because the initialization can't
+// initializing before going forward. Because the initialization can't
// proceed without GIL, there is a deadlock. We ask the calling thread to
// release GIL to avoid this situation.
TORCH_INTERNAL_ASSERT(!PyGILState_Check());
@@ -348,7 +348,7 @@ c10::intrusive_ptr<OwnerRRef> RRefContext::getOrCreateOwnerRRef(
// here is a plain TensorType, they are not equal relationship:
// specialized TensorType <: plain TensorType
//
-// In RPC we don't care the difference as we ser/de with just the
+// In RPC we don't care the difference as we ser'de with just the
// plain TensorType. This is not a issue for UserRRef creation either,
// since Tensor can only get specialized with a previous run of local
// JIT function, and we shouldn't preserve the specialized SubTensorType
@@ -318,7 +318,7 @@ class TORCH_API RRefContext {
// RRef is forwarded to the callee as new UserRRefs (if the callee is not
// the owner). In this case, we block running the user function until all
// UserRRefs are confirmed by the owner.
-// This contract gurantees that no UserRRefs can be used remotely without
+// This contract guarantees that no UserRRefs can be used remotely without
// confirmation. Note that, however, the UserRRef created by rpc.remote can
// still be passed to local functions as arguments and used there. This is by
// design, because this feature is especially useful when, say a master node
@@ -371,7 +371,7 @@ void TensorPipeAgent::checkAndSetStaticGroup(
isStaticGroupKey, std::vector<uint8_t>(), isStaticGroupVec);
std::string returnedVal = std::string(returnedVec.begin(), returnedVec.end());
// In both cases, the returned value should be the value of isStaticGroupStr,
-// otherwise there is a discrepency with initialization among one of the
+// otherwise there is a discrepancy with initialization among one of the
// members
TORCH_CHECK(
returnedVal == isStaticGroupStr,
@@ -121,8 +121,8 @@ struct TORCH_API TensorPipeRpcBackendOptions : public RpcBackendOptions {
deviceMaps[workerName] = deviceMap;
} else {
for (auto& entry : deviceMap) {
-// c10::Device has no default constructor, hence map[device] dosn't work
-// In C++-17 we can use insert_or_assign.
+// c10::Device has no default constructor, hence map[device] doesn't
+// work In C++-17 we can use insert_or_assign.
auto entryIter = iter->second.find(entry.first);
if (entryIter == iter->second.end()) {
iter->second.emplace(entry.first, entry.second);