Fix flaky SIGSEGV crash in test_profile_memory (#136304)

Fixes https://github.com/pytorch/pytorch/issues/132331

We need another barrier here to ensure that the main thread doesn't stop the profiler while other threads are still using it, which leads to a crash. I can reliably reproduce the issue with `pytest -v test/profiler/test_cpp_thread.py -k test_profile_memory --flake-finder`.

### Testing

With this change, all tests pass under `pytest -v test/profiler/test_cpp_thread.py --flake-finder`.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/136304
Approved by: https://github.com/briancoutinho
2025-10-20 21:14:14 +08:00 · 2024-09-20 02:56:47 +00:00
parent d45b0151e5
commit fe0e9fb385
1 changed files with 10 additions and 0 deletions
--- a/test/profiler/test_cpp_thread.cpp
+++ b/test/profiler/test_cpp_thread.cpp
@ -47,6 +47,8 @@ void start_threads(int thread_count, int iteration_count, bool attach) {

  static std::atomic<int> barrier = 0;
  barrier = 0;
+  static std::atomic<int> another_barrier = 0;
+  another_barrier = 0;
  thread_local bool enabled_in_main_thread = false;

  std::vector<std::thread> threads;
@ -78,6 +80,14 @@ void start_threads(int thread_count, int iteration_count, bool attach) {
        }

        ProfilerEventHandler::Handler->emulateTraining(iteration, id);
+
+        // We need another barrier here to ensure that the main thread doesn't
+        // stop the profiler while other threads are still using it. This fixes
+        // https://github.com/pytorch/pytorch/issues/132331
+        ++another_barrier;
+        while (another_barrier % thread_count) {
+          std::this_thread::yield();
+        }
      }
    });
  }