From 3ce68dc61ef1ffd5428db895cae4d9c51c9c4f72 Mon Sep 17 00:00:00 2001
From: Chirag Pandya
Date: Mon, 27 Jan 2025 19:57:44 +0000
Subject: [PATCH] [c10d] Flush file in file recorder (#145458)

Summary:
Flushing file to hopefully prevent file corruptions as reported in
https://github.com/pytorch/pytorch/pull/145125

Test Plan:
Couldn't get file corruption to occur in my tests.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/145458
Approved by: https://github.com/kwen2501
---
 .../csrc/distributed/c10d/FlightRecorder.cpp | 20 +++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/torch/csrc/distributed/c10d/FlightRecorder.cpp b/torch/csrc/distributed/c10d/FlightRecorder.cpp
index 948c3fef2ce4..397dd84ef240 100644
--- a/torch/csrc/distributed/c10d/FlightRecorder.cpp
+++ b/torch/csrc/distributed/c10d/FlightRecorder.cpp
@@ -159,18 +159,26 @@ void DebugInfoWriter::write(const std::string& trace) {
 
   // Check if the file was opened successfully.
   if (!file.is_open()) {
-    LOG(ERROR) << "Error opening file for writing NCCLPG debug info: "
+    LOG(ERROR) << "Error opening file for writing Flight Recorder debug info: "
                << filename_;
     return;
   }
 
-  file.write(trace.data(), static_cast<std::streamsize>(trace.size()));
-  if (!file) {
-    LOG(ERROR) << "Error opening file for writing NCCLPG debug info: "
-               << filename_;
+  if (!file.write(trace.data(), static_cast<std::streamsize>(trace.size()))) {
+    const auto bad = file.bad();
+    LOG(ERROR) << "Error writing Flight Recorder debug info to file: "
+               << filename_ << " bad bit: " << bad;
     return;
   }
-  LOG(INFO) << "Finished writing NCCLPG debug info to " << filename_;
+
+  // Flush the buffer to ensure data is written to the file
+  file.flush();
+  if (file.bad()) {
+    LOG(ERROR) << "Error flushing Flight Recorder debug info: " << filename_;
+    return;
+  }
+
+  LOG(INFO) << "Finished writing Flight Recorder debug info to " << filename_;
 }
 
 DebugInfoWriter& DebugInfoWriter::getWriter(int rank) {