Closes #164529. Exposes the new [ncclCommShrink](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/comms.html#ncclcommshrink) API to PyTorch. This is useful when certain GPUs or nodes must be excluded from a collective operation, for example in fault-tolerance scenarios or when dynamically adjusting resource utilization. For more info, see [Shrinking a communicator](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/usage/communicators.html#shrinking-a-communicator).

Pull Request resolved: https://github.com/pytorch/pytorch/pull/164518
Approved by: https://github.com/kwen2501
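This page shows only a test-logging helper, not the Python binding itself. As a rough sketch of how the shrink flow could look from Python — assuming the PR surfaces the NCCL call as something like `dist.shrink_group` (the name and signature here are illustrative, not confirmed by this page) — excluding a failed rank might go as follows:

```python
# Hypothetical sketch: `shrink_group` and its signature are assumptions
# based on the ncclCommShrink semantics linked above, not taken from this page.
import torch
import torch.distributed as dist

dist.init_process_group("nccl")

# Ranks to drop from the default process group, e.g. after the
# fault-tolerance layer detected a node failure.
ranks_to_exclude = [3]

if dist.get_rank() not in ranks_to_exclude:
    # Surviving ranks collectively build a smaller communicator that
    # excludes the failed ranks (mirroring ncclCommShrink at the C level).
    shrunk_pg = dist.shrink_group(ranks_to_exclude)  # assumed API

    # Collectives now run only over the surviving ranks.
    t = torch.ones(1, device="cuda")
    dist.all_reduce(t, group=shrunk_pg)
```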
44 lines · 1.0 KiB · Python
```python
"""Timestamped logging helpers for distributed tests.

Each message is prefixed with the elapsed time since module import and the
calling rank, so interleaved output from multiple ranks stays readable.
"""

import logging
import time

# Reference point for the elapsed-time prefix: the moment this module loaded.
_start_time = time.time()

_logger = logging.getLogger(__name__)


def _ts():
    """Return seconds elapsed since the module was imported."""
    return time.time() - _start_time


def configure(level=logging.INFO, force=False):
    """Configure the root logger for test output."""
    try:
        logging.basicConfig(
            level=level,
            format="%(asctime)s %(name)s %(levelname)s: %(message)s",
            force=force,
        )
    except TypeError:
        # The `force` keyword was added to logging.basicConfig in Python 3.8;
        # fall back to a plain call on older interpreters.
        logging.basicConfig(
            level=level, format="%(asctime)s %(name)s %(levelname)s: %(message)s"
        )


def log_test_info(rank, message):
    _logger.info("[%7.3fs][Rank %s] %s", _ts(), rank, message)


def log_test_success(rank, message):
    _logger.info("[%7.3fs][Rank %s] ✅ %s", _ts(), rank, message)


def log_test_validation(rank, message):
    _logger.info("[%7.3fs][Rank %s] ✓ %s", _ts(), rank, message)


def log_test_warning(rank, message):
    _logger.warning("[%7.3fs][Rank %s] ⚠️ %s", _ts(), rank, message)


def log_test_error(rank, message):
    _logger.error("[%7.3fs][Rank %s] ✗ %s", _ts(), rank, message)
```
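A minimal usage sketch of these helpers. The module name `test_logging` is an assumption; import the helpers from wherever this file lives in the test tree:

```python
# Usage sketch: the module name `test_logging` is assumed, not given above.
import logging

import test_logging as tl

tl.configure(level=logging.DEBUG, force=True)

rank = 0  # in a real test this would come from torch.distributed.get_rank()
tl.log_test_info(rank, "starting shrink test")
tl.log_test_validation(rank, "communicator state verified")
tl.log_test_success(rank, "all collectives completed")
```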