mirror of
https://github.com/pytorch/pytorch.git
synced 2025-11-11 22:34:53 +08:00
[DCP] Adds support for non-primatives in async_save by deep copying during cpu offloading (#123941)
Adds support for non-primatives in async_save by deep copying during cpu offloading. If users are not type checking, the expectation in async is likely that the object is copied Differential Revision: [D56065237](https://our.internmc.facebook.com/intern/diff/D56065237/) Pull Request resolved: https://github.com/pytorch/pytorch/pull/123941 Approved by: https://github.com/fegin
This commit is contained in:
committed by
PyTorch MergeBot
parent
14b2273b0c
commit
46a25cc0db
@ -1,3 +1,4 @@
|
||||
import copy
|
||||
import io
|
||||
import math
|
||||
from typing import Any, Callable, Dict, Optional, Tuple, TYPE_CHECKING
|
||||
@ -166,7 +167,7 @@ def _iterate_state_dict(
|
||||
if isinstance(iter_object, tuple):
|
||||
ret = tuple(ret)
|
||||
elif not type_check:
|
||||
ret = iter_object
|
||||
ret = copy.deepcopy(iter_object)
|
||||
else:
|
||||
raise ValueError(f"Unexpected value type {type(iter_object)}")
|
||||
|
||||
|
||||
@ -216,7 +216,9 @@ def async_save(
|
||||
torch.device("cpu") in pg._device_types # type: ignore[attr-defined]
|
||||
), "A CPU backend must be enabled for async save; try initializing process group with 'cpu:gloo,cuda:nccl'"
|
||||
|
||||
cpu_state_dict = _offload_state_dict_to_cpu(_stateful_to_state_dict(state_dict))
|
||||
cpu_state_dict = _offload_state_dict_to_cpu(
|
||||
_stateful_to_state_dict(state_dict), type_check=False
|
||||
)
|
||||
|
||||
executor = ThreadPoolExecutor(max_workers=1)
|
||||
f: Future = executor.submit(
|
||||
|
||||
Reference in New Issue
Block a user