mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-20 12:54:11 +08:00
Add wait counter on cuda::device_synchronize (#138883)
The wait counter typically has only minute-level precision, but if there is a collective in the queue, the resulting wait will still show up in it. We think this explains up to eight minutes of delay in some compile traces we're looking at, but the counter would definitively prove it. Signed-off-by: Edward Z. Yang <ezyang@meta.com> Differential Revision: [D64944970](https://our.internmc.facebook.com/intern/diff/D64944970) Pull Request resolved: https://github.com/pytorch/pytorch/pull/138883 Approved by: https://github.com/eqy
This commit is contained in:
committed by
PyTorch MergeBot
parent
dbbdfd9df5
commit
03f9136870
@@ -1,5 +1,6 @@
|
||||
#include <c10/cuda/CUDAFunctions.h>
|
||||
#include <c10/macros/Macros.h>
|
||||
#include <c10/util/WaitCounter.h>
|
||||
|
||||
#include <limits>
|
||||
|
||||
@@ -138,6 +139,7 @@ void device_synchronize() {
|
||||
if (C10_UNLIKELY(interp)) {
|
||||
(*interp)->trace_gpu_device_synchronization(c10::kCUDA);
|
||||
}
|
||||
STATIC_SCOPED_WAIT_COUNTER(pytorch.wait_counter.cuda_device_synchronize);
|
||||
C10_CUDA_CHECK(cudaDeviceSynchronize());
|
||||
}
|
||||
|
||||
|
Reference in New Issue
Block a user