[torchelastic] ensure grandchild processes are restarted correctly (#113231)
When torchelastic notices that one rank has failed, it sends SIGTERM to the other trainer ranks to tear them down before restarting. However, if the trainer itself launches subprocesses, or is launched by a non-Python wrapper script, the SIGTERM is delivered only to the direct child of torchelastic and not to all of its descendants. This change launches subprocesses in a new Linux session, which creates a new process group whose pgid equals the trainer's pid. When signals are sent later, they are delivered to the process group rather than just the direct child.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/113231
Approved by: https://github.com/H-Huang
Committed by: PyTorch MergeBot
Parent: 958f3b0df6
Commit: d968c4cac3
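
To make the mechanism concrete before the diff, here is a minimal, self-contained sketch (not part of the PR; POSIX-only, and the /bin/sh wrapper command is purely illustrative) showing why signalling only the direct child misses grandchildren, and how a new session plus os.killpg reaches the whole process tree:

import os
import signal
import subprocess
import time

# A shell that backgrounds a grandchild (`sleep`), mimicking a trainer
# launched through a non-Python wrapper script.
proc = subprocess.Popen(
    ["/bin/sh", "-c", "sleep 300 & wait"],
    start_new_session=True,  # new session => new process group with pgid == proc.pid
)
time.sleep(0.5)  # give the shell a moment to fork the grandchild

# proc.send_signal(signal.SIGTERM) would reach only the shell; the
# backgrounded `sleep` would be reparented and keep running. Signalling
# the process group instead tears down every descendant in one call.
os.killpg(proc.pid, signal.SIGTERM)
proc.wait()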
@@ -562,6 +562,9 @@ class SubprocessHandler:
         self.proc: subprocess.Popen = self._popen(args_str, env_vars)

     def _popen(self, args: Tuple, env: Dict[str, str]) -> subprocess.Popen:
+        kwargs: Dict[str, Any] = {}
+        if not IS_WINDOWS:
+            kwargs['start_new_session'] = True
         return subprocess.Popen(
             # pyre-fixme[6]: Expected `Union[typing.Sequence[Union[_PathLike[bytes],
             #  _PathLike[str], bytes, str]], bytes, str]` for 1st param but got
@@ -570,12 +573,16 @@ class SubprocessHandler:
             env=env,
             stdout=self._stdout,
             stderr=self._stderr,
+            **kwargs
         )

     def close(self, death_sig: Optional[signal.Signals] = None) -> None:
         if not death_sig:
             death_sig = _get_default_signal()
-        self.proc.send_signal(death_sig)
+        if IS_WINDOWS:
+            self.proc.send_signal(death_sig)
+        else:
+            os.killpg(self.proc.pid, death_sig)
         if self._stdout:
             self._stdout.close()
         if self._stderr:
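
For reference, the pattern the diff lands in SubprocessHandler condenses to the stripped-down, hypothetical handler below. IS_WINDOWS and _get_default_signal are reimplemented here only to keep the sketch self-contained and are assumed to match the torchelastic originals; the real class also wires up stdout/stderr redirection and the child environment:

import os
import signal
import subprocess
import sys
from typing import Any, Dict, List, Optional

IS_WINDOWS = sys.platform == "win32"

def _get_default_signal() -> signal.Signals:
    # Assumed default teardown signal: CTRL_C_EVENT on Windows, SIGTERM elsewhere.
    return signal.CTRL_C_EVENT if IS_WINDOWS else signal.SIGTERM

class MiniSubprocessHandler:
    """Hypothetical, stripped-down analogue of SubprocessHandler."""

    def __init__(self, cmd: List[str]) -> None:
        kwargs: Dict[str, Any] = {}
        if not IS_WINDOWS:
            # A new session implies a new process group whose pgid equals
            # the child's pid, so one group signal reaches grandchildren too.
            kwargs["start_new_session"] = True
        self.proc: subprocess.Popen = subprocess.Popen(cmd, **kwargs)

    def close(self, death_sig: Optional[signal.Signals] = None) -> None:
        if not death_sig:
            death_sig = _get_default_signal()
        if IS_WINDOWS:
            self.proc.send_signal(death_sig)  # Windows has no POSIX process groups
        else:
            os.killpg(self.proc.pid, death_sig)  # signal the whole group

A caller would construct the handler with the trainer command and invoke close() when any rank fails; on POSIX the group signal now also reaches descendants spawned by wrapper scripts.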