Add wait counter on cuda::device_synchronize (#138883)

The wait counter typically has only minute-level precision, but if there is a collective in the queue, the resulting wait will show up. We think this explains up to eight minutes of delay in some compile traces we're looking at, but the counter would definitively prove it. Signed-off-by: Edward Z. Yang <ezyang@meta.com> Differential Revision: [D64944970](https://our.internmc.facebook.com/intern/diff/D64944970) Pull Request resolved: https://github.com/pytorch/pytorch/pull/138883 Approved by: https://github.com/eqy
2025-10-20 21:14:14 +08:00 · 2024-10-24 20:26:27 -07:00
parent dbbdfd9df5
commit 03f9136870
1 changed files with 2 additions and 0 deletions
--- a/c10/cuda/CUDAFunctions.cpp
+++ b/c10/cuda/CUDAFunctions.cpp
@@ -1,5 +1,6 @@
 #include <c10/cuda/CUDAFunctions.h>
 #include <c10/macros/Macros.h>
+#include <c10/util/WaitCounter.h>

 #include <limits>

@@ -138,6 +139,7 @@ void device_synchronize() {
  if (C10_UNLIKELY(interp)) {
    (*interp)->trace_gpu_device_synchronization(c10::kCUDA);
  }
+  STATIC_SCOPED_WAIT_COUNTER(pytorch.wait_counter.cuda_device_synchronize);
  C10_CUDA_CHECK(cudaDeviceSynchronize());
 }