Compare commits

...

2 Commits

Author SHA1 Message Date
d7b7cac4eb [DCP] Returns a copy of sd in copy sd (#123567)
I found that returning the copy is actually useful in situations where you might do something like:

```
ret = _copy_state_dict(obj, cache)
ret.update(some_other_values)
```

and would like the structure of `cache` to remain unaffected by `ret.update(some_other_values)`. Open to feedback here; not returning a copy might force the user to make additional copies themselves in this case.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/123567
Approved by: https://github.com/wz337
2024-04-18 13:20:20 -07:00
b86edd97d6 [nccl-pg] print broadcast ncclunique id duration (#123963)
Summary: Print how long the NCCL process group takes to broadcast the NCCL unique ID, for measurement purposes.

Differential Revision: D56048059

Pull Request resolved: https://github.com/pytorch/pytorch/pull/123963
Approved by: https://github.com/wconstab
2024-04-16 17:03:25 -07:00
2 changed files with 23 additions and 4 deletions

View File

@ -1976,7 +1976,16 @@ std::vector<std::shared_ptr<NCCLComm>>& ProcessGroupNCCL::getNCCLComm(
// For point-to-point communication on the same process, don't need broadcast.
if (!isSendRecvSelf) {
// Broadcast so that each process can have a unique NCCL ID
auto timeStarted = std::chrono::steady_clock::now();
broadcastUniqueNCCLID(&ncclID, singleP2POp, devicesKey, p2pRank);
auto timerDeltaMs =
std::chrono::duration_cast<std::chrono::duration<double>>(
std::chrono::steady_clock::now() - timeStarted)
.count() *
1000;
LOG(INFO) << logPrefix()
<< "ProcessGroupNCCL broadcast unique ID through store took "
<< timerDeltaMs << " ms";
}
at::cuda::OptionalCUDAGuard gpuGuard;

View File

@ -117,7 +117,12 @@ def _iterate_state_dict(
not isinstance(companion_obj, dict)
or set(companion_obj.keys()) != set(iter_object.keys())
):
raise CompanionMismatch()
msg = (
""
if isinstance(companion_obj, dict)
else f"{set(companion_obj.keys())=} {set(iter_object.keys())=}"
)
raise CompanionMismatch(msg)
ret = {
key: _iterate_state_dict(
@ -312,10 +317,12 @@ def _copy_state_dict(
state_dict: Dict[str, Any],
copy_state_dict: Dict[str, Any],
non_blocking: bool = False,
):
) -> Dict[str, Any]:
"""
Copies all tensors in a given state dict into a different state_dict with the
same structure.
same structure. Additionally, a copied state dict with the same value references
is returned. Editing the keys on this state dict will not affect the
passed in copy_state_dict (but the value references are the same).
.. warning::
It is expected by this function that state_dict and copy_state_dict share
@ -331,9 +338,12 @@ def _copy_state_dict(
The state dict we are copying into. This state_dict must have exactly
the same structure as the source `state_dict`.
non_blocking: (bool): Whether copy ops should be performed asynchronously
Returns:
State Dict copy
"""
_iterate_state_dict(
return _iterate_state_dict(
state_dict,
_identity_func,
_identity_func,