From fe0e9fb385d6e75a01c9feaa459252db09b71839 Mon Sep 17 00:00:00 2001 From: Huy Do Date: Fri, 20 Sep 2024 02:56:47 +0000 Subject: [PATCH] Fix flaky SIGSEGV crash in test_profile_memory (#136304) Fixes https://github.com/pytorch/pytorch/issues/132331 We need another barrier here to ensure that the main thread doesn't stop the profiler while other threads are still using it (which causes the crash). I can reliably reproduce the issue with `pytest -v test/profiler/test_cpp_thread.py -k test_profile_memory --flake-finder`. ### Testing With this change, all tests in `pytest -v test/profiler/test_cpp_thread.py --flake-finder` pass. Pull Request resolved: https://github.com/pytorch/pytorch/pull/136304 Approved by: https://github.com/briancoutinho --- test/profiler/test_cpp_thread.cpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/test/profiler/test_cpp_thread.cpp b/test/profiler/test_cpp_thread.cpp index ce60d9c816c6..58792313a90b 100644 --- a/test/profiler/test_cpp_thread.cpp +++ b/test/profiler/test_cpp_thread.cpp @@ -47,6 +47,8 @@ void start_threads(int thread_count, int iteration_count, bool attach) { static std::atomic barrier = 0; barrier = 0; + static std::atomic another_barrier = 0; + another_barrier = 0; thread_local bool enabled_in_main_thread = false; std::vector threads; @@ -78,6 +80,14 @@ void start_threads(int thread_count, int iteration_count, bool attach) { } ProfilerEventHandler::Handler->emulateTraining(iteration, id); + + // We need another barrier here to ensure that the main thread doesn't + // stop the profiler while other threads are still using it. This fixes + // https://github.com/pytorch/pytorch/issues/132331 + ++another_barrier; + while (another_barrier % thread_count) { + std::this_thread::yield(); + } } }); }