[BE] minor logging cleanup in distributed (#122921)

Summary:
    Minor logging cleanup in the distributed library (a short sketch of the
    pattern follows this list):
    1. Don't use f-formatted strings in log calls; prefer lazy %-style
       formatting. This addresses linter issues.
    2. Nit: make use of the previously unused exception variable `e` in a few logs.
    3. Change a few log levels from info to debug, as requested in issue #113545.
    4. Nit: rename `log` to `logger` in a few files for consistency.
    5. Fix a linter error.
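
    For illustration, a minimal sketch of the pattern these changes converge
    on; the helper function, the store object, and the messages below are
    hypothetical and not taken from this diff:

        import logging

        logger = logging.getLogger(__name__)  # module-level logger, named "logger" for consistency

        def _run_rendezvous(store):  # hypothetical helper
            try:
                store.connect()  # hypothetical call that may raise
            except OSError as e:
                # Before: logger.error(f"rendezvous failed: {e}") builds the
                # string eagerly even when the level is disabled, and `e` was
                # sometimes left unused. After: %-style args that the logging
                # module interpolates only if the record is actually emitted.
                logger.error("rendezvous failed: %s", e)
            # Routine progress messages move from info to debug (issue #113545).
            logger.debug("rendezvous store connected")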

    Test Plan:
    1. Local build passes.
    2. Linter is happy.

    Reviewers: wanchaol

Pull Request resolved: https://github.com/pytorch/pytorch/pull/122921
Approved by: https://github.com/wanchaol
Author: Chirag Pandya
Date: 2024-03-29 03:33:58 +00:00
Committed by: PyTorch MergeBot
Parent: 6a45809580
Commit: b6201a60c5
25 changed files with 195 additions and 201 deletions


@@ -38,7 +38,7 @@ IS_WINDOWS = sys.platform == "win32"
 IS_MACOS = sys.platform == "darwin"

-log = logging.getLogger(__name__)
+logger = logging.getLogger(__name__)

 __all__ = [
     "DefaultLogsSpecs",
@@ -260,7 +260,7 @@ class DefaultLogsSpecs(LogsSpecs):
         base_log_dir = log_dir or tempfile.mkdtemp(prefix="torchelastic_")
         os.makedirs(base_log_dir, exist_ok=True)
         dir = tempfile.mkdtemp(prefix=f"{rdzv_run_id}_", dir=base_log_dir)
-        log.info("log directory set to: %s", dir)
+        logger.info("log directory set to: %s", dir)
         return dir

     def reify(self, envs: Dict[int, Dict[str, str]],) -> LogsDest:
@@ -276,7 +276,7 @@ class DefaultLogsSpecs(LogsSpecs):
         if nprocs > 0:
             global_env = envs[0]
         else:
-            log.warning("Empty envs map provided when defining logging destinations.")
+            logger.warning("Empty envs map provided when defining logging destinations.")
         # Keys are always defined, but values can be missing in unit tests
         run_id = global_env.get("TORCHELASTIC_RUN_ID", "test_run_id")
         restart_count = global_env.get("TORCHELASTIC_RESTART_COUNT", "0")
@@ -355,7 +355,7 @@ class DefaultLogsSpecs(LogsSpecs):
             error_file = os.path.join(clogdir, "error.json")
             error_files[local_rank] = error_file
-            log.info("Setting worker%s reply file to: %s", local_rank, error_file)
+            logger.info("Setting worker%s reply file to: %s", local_rank, error_file)
             envs[local_rank]["TORCHELASTIC_ERROR_FILE"] = error_file

         return LogsDest(stdouts, stderrs, tee_stdouts, tee_stderrs, error_files)
@@ -692,7 +692,7 @@ class MultiprocessContext(PContext):
             failed_proc = self._pc.processes[failed_local_rank]
             error_filepath = self.error_files[failed_local_rank]

-            log.exception(
+            logger.exception(
                 "failed (exitcode: %s)"
                 " local_rank: %s (pid: %s)"
                 " of fn: %s (start_method: %s)",
@@ -724,7 +724,7 @@ class MultiprocessContext(PContext):
             return
         for proc in self._pc.processes:
             if proc.is_alive():
-                log.warning("Closing process %s via signal %s", proc.pid, death_sig.name)
+                logger.warning("Closing process %s via signal %s", proc.pid, death_sig.name)
                 try:
                     os.kill(proc.pid, death_sig)
                 except ProcessLookupError:
@@ -739,7 +739,7 @@ class MultiprocessContext(PContext):
             proc.join(time_to_wait)
         for proc in self._pc.processes:
             if proc.is_alive():
-                log.warning(
+                logger.warning(
                     "Unable to shutdown process %s via %s, forcefully exiting via %s",
                     proc.pid, death_sig, _get_kill_signal()
                 )
@@ -823,7 +823,7 @@ class SubprocessContext(PContext):
         )
         if result.is_failed():
             first_failure = min(result.failures.values(), key=lambda f: f.timestamp)
-            log.error(
+            logger.error(
                 "failed (exitcode: %s)"
                 " local_rank: %s (pid: %s)"
                 " of binary: %s",
@@ -848,7 +848,7 @@ class SubprocessContext(PContext):
             return
         for handler in self.subprocess_handlers.values():
             if handler.proc.poll() is None:
-                log.warning(
+                logger.warning(
                     "Sending process %s closing signal %s", handler.proc.pid, death_sig.name
                 )
                 handler.close(death_sig=death_sig)
@@ -865,7 +865,7 @@ class SubprocessContext(PContext):
             pass
         for handler in self.subprocess_handlers.values():
             if handler.proc.poll() is None:
-                log.warning(
+                logger.warning(
                     "Unable to shutdown process %s via %s, forcefully exiting via %s",
                     handler.proc.pid, death_sig, _get_kill_signal()
                 )