Mirror of https://github.com/pytorch/pytorch.git, synced 2025-10-21 05:34:18 +08:00
[BE][5/16] fix typos in torch/ (torch/distributed/) (#156315)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/156315
Approved by: https://github.com/Skylion007, https://github.com/albanD
ghstack dependencies: #156313, #156314
Commit: c2f0292bd5 (parent: ead741c5fb), committed by PyTorch MergeBot
@@ -547,7 +547,7 @@ class _CollOp:
     Args:
         op (Callable): A collective function, e.g. ``torch.distributed.all_reduce``.
         tensor (Tensor): Tensor to operate on.
-        dst_tensor (Tensor, optional): Provided when source and destinaton tensors are not the same.
+        dst_tensor (Tensor, optional): Provided when source and destination tensors are not the same.
         redop (ReduceOp, optional): reduce operation.
         root (int, optional): root of broadcast or reduce.
     """
@@ -1610,7 +1610,7 @@ def init_process_group(
             options we support is ``ProcessGroupNCCL.Options`` for the ``nccl``
             backend, ``is_high_priority_stream`` can be specified so that
             the nccl backend can pick up high priority cuda streams when
-            there're compute kernels waiting. For other availble options to config nccl,
+            there're compute kernels waiting. For other available options to config nccl,
             See https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/types.html#ncclconfig-t
         device_id (torch.device | int, optional): a single, specific device
             this process will work on, allowing for backend-specific
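For context, a minimal sketch (not part of this commit) of how the ``is_high_priority_stream`` option mentioned in this hunk is typically passed to ``init_process_group``; it assumes a CUDA build with NCCL and that the launcher already set the ``env://`` rendezvous variables (MASTER_ADDR, MASTER_PORT, RANK, WORLD_SIZE):

    import torch
    import torch.distributed as dist

    # Configure the nccl backend to pick up high-priority CUDA streams.
    nccl_options = dist.ProcessGroupNCCL.Options()
    nccl_options.is_high_priority_stream = True

    dist.init_process_group(
        backend="nccl",
        pg_options=nccl_options,
        device_id=torch.device("cuda", torch.cuda.current_device()),
    )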
@@ -2692,7 +2692,7 @@ def _time_estimator(
     backend = group._get_backend(device)
     if not backend.supports_time_estimate:
         raise NotImplementedError(
-            f"collective time estimator is not supported in the curent version of backend {backend}"
+            f"collective time estimator is not supported in the current version of backend {backend}"
         )
     backend._start_time_estimate()  # type: ignore[attr-defined]
     cm = _TimeEstimator()
@@ -3058,7 +3058,7 @@ def _object_to_tensor(obj, device, group):
         _pickler(f).dump(obj)
         byte_storage = torch.ByteStorage._from_buffer(f.getvalue())  # type: ignore[attr-defined]
         # Do not replace `torch.ByteTensor` or `torch.LongTensor` with torch.tensor and specifying dtype.
-        # Otherwise, it will casue 100X slowdown.
+        # Otherwise, it will cause 100X slowdown.
         # See: https://github.com/pytorch/pytorch/issues/65696
         byte_tensor = torch.ByteTensor(byte_storage).to(device)
         if get_debug_level() == DebugLevel.DETAIL and is_nccl_available():
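The comment fixed in this hunk refers to the pickle-to-tensor path; a small standalone sketch of that pattern (using plain ``pickle.dump`` and a hypothetical payload rather than the module's ``_pickler``):

    import io
    import pickle

    import torch

    # Build a uint8 tensor directly from the pickled buffer's storage.
    # Going through torch.tensor(list(buffer), dtype=torch.uint8) instead
    # copies element by element, which is the slowdown the comment warns about.
    f = io.BytesIO()
    pickle.dump({"answer": 42}, f)  # hypothetical payload
    byte_storage = torch.ByteStorage._from_buffer(f.getvalue())
    byte_tensor = torch.ByteTensor(byte_storage)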
@@ -3116,7 +3116,7 @@ def all_gather_object(object_list, obj, group=None):
     .. note:: For NCCL-based processed groups, internal tensor representations
         of objects must be moved to the GPU device before communication takes
         place. In this case, the device used is given by
-        ``torch.cuda.current_device()`` and it is the user's responsiblity to
+        ``torch.cuda.current_device()`` and it is the user's responsibility to
         ensure that this is set so that each rank has an individual GPU, via
         ``torch.cuda.set_device()``.

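A minimal usage sketch of the note above, assuming an already-initialized NCCL process group and one GPU per rank:

    import torch
    import torch.distributed as dist

    # Pin this rank to its own GPU before the collective, since the pickled
    # objects are staged on torch.cuda.current_device() for NCCL communication.
    torch.cuda.set_device(dist.get_rank() % torch.cuda.device_count())

    obj = {"rank": dist.get_rank()}           # any picklable object
    gathered = [None] * dist.get_world_size()
    dist.all_gather_object(gathered, obj)     # every rank receives all objects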
@@ -3220,7 +3220,7 @@ def gather_object(
     .. note:: For NCCL-based processed groups, internal tensor representations
         of objects must be moved to the GPU device before communication takes
         place. In this case, the device used is given by
-        ``torch.cuda.current_device()`` and it is the user's responsiblity to
+        ``torch.cuda.current_device()`` and it is the user's responsibility to
         ensure that this is set so that each rank has an individual GPU, via
         ``torch.cuda.set_device()``.

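The same device note applies to ``gather_object``; a short sketch assuming an initialized NCCL group with rank 0 as the destination:

    import torch
    import torch.distributed as dist

    torch.cuda.set_device(dist.get_rank() % torch.cuda.device_count())

    output = [None] * dist.get_world_size() if dist.get_rank() == 0 else None
    dist.gather_object({"rank": dist.get_rank()}, output, dst=0)
    # Only rank 0's `output` list is populated after the call.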
@@ -4881,7 +4881,7 @@ def monitored_barrier(
     if timeout is None:
         timeout = _get_default_timeout(get_backend(group))
     elif isinstance(timeout, float):
-        # TODO(whc) aparently some existing test case for monitored_barrier passes in a timeout in float format?
+        # TODO(whc) apparently some existing test case for monitored_barrier passes in a timeout in float format?
         warnings.warn(
             "Please specify timeout arg as a timedelta. "
             f"Converting current value of {timeout} assuming it represents seconds",
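Per the TODO above, a float timeout is converted with a warning; passing a ``timedelta`` avoids it. A small sketch, assuming a gloo-backed group (which is what ``monitored_barrier`` supports):

    from datetime import timedelta

    import torch.distributed as dist

    # Preferred: an explicit timedelta rather than a bare float of seconds.
    dist.monitored_barrier(timeout=timedelta(seconds=30), wait_all_ranks=True)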
@@ -4979,16 +4979,16 @@ def split_group(
     group_desc: Optional[str] = None,
 ) -> Optional[ProcessGroup]:
     """
-    Create a new process group splitted from the given parent process group.
+    Create a new process group split from the given parent process group.

     warning:: This is an experimental API. Only the ``NCCL`` and custom plugin backends
         are supported. Other backends will raise an error.
-    Users of this API must gurantee that all ranks in the parent group enter this API call,
+    Users of this API must guarantee that all ranks in the parent group enter this API call,
     and the split of the sub groups is the same across all ranks in the parent group.

     Args:
         parent_pg (ProcessGroup, optional): The parent process group. If None,
-            the default process group will be used. Users need to gurantee that
+            the default process group will be used. Users need to guarantee that
             the parent group is fully initialized (e.g, communicators are initialized)
         split_ranks (list[list[int]]): the split ranks, which is a list of list of ranks.
             Users need to make sure the validity of the split ranks such that one
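A usage sketch of ``split_group`` under the constraints stated above: a hypothetical 4-rank NCCL default group that is fully initialized, with every rank making the identical call; ranks outside both sublists would get ``None`` back:

    import torch
    import torch.distributed as dist
    from torch.distributed.distributed_c10d import split_group

    # All ranks in the parent (default) group must enter this call with the
    # same split_ranks argument; each rank gets back its own subgroup.
    subgroup = split_group(split_ranks=[[0, 1], [2, 3]])
    if subgroup is not None:
        dist.all_reduce(torch.ones(1, device="cuda"), group=subgroup)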
@@ -5225,7 +5225,7 @@ def new_group(
         specifying what additional options need to be passed in during
         the construction of specific process groups. i.e. for the ``nccl``
         backend, ``is_high_priority_stream`` can be specified so that
-        process group can pick up high priority cuda streams. For other availble options to config nccl,
+        process group can pick up high priority cuda streams. For other available options to config nccl,
         See https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/types.html#ncclconfig-t
     use_local_synchronization (bool, optional): perform a group-local barrier at the end of the process group creation.
         This is different in that non-member ranks don't need to call into API and don't
@@ -5246,7 +5246,7 @@ def new_group(
         as non-member ranks don't join the group barrier().

     N.B. use_local_synchronization=True can lead to deadlocks when each rank creates
-    multiple overlaping process groups. To avoid that, make sure all ranks follow the
+    multiple overlapping process groups. To avoid that, make sure all ranks follow the
     same global creation order.
     """
     return _new_group_with_tag(
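A sketch of the ``use_local_synchronization`` behavior discussed in the two hunks above, for a hypothetical 4-rank job with an initialized default group; only member ranks call ``new_group``, which is what the option permits:

    import torch.distributed as dist

    rank = dist.get_rank()

    # With use_local_synchronization=True, non-member ranks do not have to
    # call new_group; the barrier at creation time is local to the new group.
    if rank in (0, 1):
        pg_a = dist.new_group(ranks=[0, 1], use_local_synchronization=True)
    if rank in (2, 3):
        pg_b = dist.new_group(ranks=[2, 3], use_local_synchronization=True)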