[wall_clock_breakdown] always log stats when enabled (#7617)

Currently, when the main logger is at WARN level, `wall_clock_breakdown: true`
never logs anything. This is wrong, because it silently disables functionality
that is crucial at times. Plus I think we have a disconnect somewhere, since
the recently added `--log_level` flag doesn't seem to change this logger's
level.

The future plan is to be able to have different log levels for different
modules, but for now just use `print` if `wall_clock_breakdown` is
`True`, so this functionality is not log-level dependent.

`print` is also less noisy than the logger: the long prefix generated by the
latter is of no value to the user here, since we print stats and not
code-related logs. The printed results are therefore easier to digest.

Signed-off-by: Stas Bekman <stas@stason.org>
This commit is contained in:
Stas Bekman
2025-10-02 16:08:39 -07:00
committed by GitHub
parent e37c37acdd
commit 9cbd3edd0d
2 changed files with 8 additions and 3 deletions

View File

@ -83,7 +83,7 @@ def print_configuration(args, name):
logger.info(" {} {} {}".format(arg, dots, getattr(args, arg)))
def log_dist(message, ranks=None, level=logging.INFO):
def log_dist(message, ranks=None, level=logging.INFO, use_logger=True):
from deepspeed import comm as dist
"""Log message when one of following condition meets
@ -94,6 +94,7 @@ def log_dist(message, ranks=None, level=logging.INFO):
message (str)
ranks (list)
level (int)
use_logger (bool): if `False` ignores the log-levels and always prints
"""
should_log = not dist.is_initialized()
@ -104,7 +105,10 @@ def log_dist(message, ranks=None, level=logging.INFO):
should_log = should_log or (my_rank in set(ranks))
if should_log:
final_message = "[Rank {}] {}".format(my_rank, message)
logger.log(level, final_message)
if use_logger:
logger.log(level, final_message)
else:
print(final_message)
@functools.lru_cache(None)

View File

@ -148,7 +148,8 @@ class SynchronizedWallClockTimer:
elapsed_time = (self.timers[name].elapsed(reset=reset) / normalizer)
string += " | {}: {:.2f}".format(name, elapsed_time)
log_dist(string, ranks=ranks or [0])
# Timer logging should be independent of the global log level: it is already conditional on wall_clock_breakdown being True, so passing use_logger=False makes the stats always print.
log_dist(string, ranks=ranks or [0], use_logger=False)
def get_mean(self, names, normalizer=1.0, reset=True):
"""Get the mean of a group of timers."""