mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-21 05:34:18 +08:00
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/57454 DDP with NCCL AllReduce for the entire model experiment from Quip https://fb.quip.com/iQUtAeKIxWpF I have been testing this on the AI cluster. There seem to be some connection problems with RPC when using multiple trainers or parameter servers. ``` Namespace(bconfig_id='3', dconfig_id='DummyData', mconfig_id='DummyModel', pconfig_id='None', tconfig_id='DdpNcclTrainer') benchmark warmup done metrics for trainer=0 +-----------------------------------+----------+---------+----------+------------+-----------+ | name | min | max | mean | variance | stdev | +===================================+==========+=========+==========+============+===========+ | backward_metric,backward | 2.45248 | 4.18304 | 3.972 | 0.097122 | 0.311644 | +-----------------------------------+----------+---------+----------+------------+-----------+ | batch_level_metric,batch_all | 4.11955 | 4.58138 | 4.31439 | 0.00229848 | 0.0479424 | +-----------------------------------+----------+---------+----------+------------+-----------+ | foward_metric,forward_pass | 0.141312 | 1.4807 | 0.222566 | 0.0555432 | 0.235676 | +-----------------------------------+----------+---------+----------+------------+-----------+ | hook_future_metric,nccl_allreduce | 0.191488 | 3.54099 | 3.11694 | 0.557106 | 0.746395 | +-----------------------------------+----------+---------+----------+------------+-----------+ metrics for trainer=1 +-----------------------------------+----------+---------+----------+-------------+------------+ | name | min | max | mean | variance | stdev | +===================================+==========+=========+==========+=============+============+ | backward_metric,backward | 2.4617 | 2.59174 | 2.51196 | 0.000938276 | 0.0306313 | +-----------------------------------+----------+---------+----------+-------------+------------+ | batch_level_metric,batch_all | 4.22605 | 4.71757 | 4.27921 | 0.00468424 | 0.0684415 | 
+-----------------------------------+----------+---------+----------+-------------+------------+ | foward_metric,forward_pass | 0.807936 | 1.50118 | 0.846008 | 0.00601693 | 0.0775688 | +-----------------------------------+----------+---------+----------+-------------+------------+ | hook_future_metric,nccl_allreduce | 0.108544 | 0.1536 | 0.11222 | 2.16726e-05 | 0.00465538 | +-----------------------------------+----------+---------+----------+-------------+------------+ metrics for all trainer +-----------------------------------+----------+---------+----------+------------+-----------+ | name | min | max | mean | variance | stdev | +===================================+==========+=========+==========+============+===========+ | backward_metric,backward | 2.45248 | 4.18304 | 3.24198 | 0.584391 | 0.764455 | +-----------------------------------+----------+---------+----------+------------+-----------+ | batch_level_metric,batch_all | 4.11955 | 4.71757 | 4.2968 | 0.00378467 | 0.0615197 | +-----------------------------------+----------+---------+----------+------------+-----------+ | foward_metric,forward_pass | 0.141312 | 1.50118 | 0.534287 | 0.128284 | 0.358167 | +-----------------------------------+----------+---------+----------+------------+-----------+ | hook_future_metric,nccl_allreduce | 0.108544 | 3.54099 | 1.61458 | 2.5456 | 1.59549 | +-----------------------------------+----------+---------+----------+------------+-----------+ ``` Test Plan: Imported from OSS Reviewed By: H-Huang, ngimel Differential Revision: D28296175 Pulled By: gcramer23 fbshipit-source-id: 5dd208fc86f8b5558d7c8860d685bb25c2e09fe7
24 lines
529 B
Python
24 lines
529 B
Python
import time
|
|
|
|
from .MetricBase import MetricBase
|
|
|
|
|
|
class CPUMetric(MetricBase):
    """Wall-clock timer for a single named CPU-side metric.

    Captures a start and an end timestamp with ``time.time()`` and
    reports the difference between them via ``elapsed_time``.
    """

    def __init__(self, name: str):
        # Label under which this metric's timings are reported.
        self.name = name
        # Timestamps in seconds since the epoch; each stays None until
        # the matching record_* call has run.
        self.start = None
        self.end = None

    def record_start(self):
        """Capture the current wall-clock time as the start of the interval."""
        self.start = time.time()

    def record_end(self):
        """Capture the current wall-clock time as the end of the interval."""
        self.end = time.time()

    def elapsed_time(self):
        """Return the measured interval in seconds (end - start).

        Raises:
            RuntimeError: if ``record_start`` or ``record_end`` has not
                been called yet.
        """
        # Validate both endpoints before computing the difference; the
        # error messages match the original ("start is None" / "end is None").
        for label, stamp in (("start", self.start), ("end", self.end)):
            if stamp is None:
                raise RuntimeError(f"{label} is None")
        return self.end - self.start