diff --git a/torch/csrc/distributed/c10d/FlightRecorder.cpp b/torch/csrc/distributed/c10d/FlightRecorder.cpp index 948c3fef2ce4..397dd84ef240 100644 --- a/torch/csrc/distributed/c10d/FlightRecorder.cpp +++ b/torch/csrc/distributed/c10d/FlightRecorder.cpp @@ -159,18 +159,26 @@ void DebugInfoWriter::write(const std::string& trace) { // Check if the file was opened successfully. if (!file.is_open()) { - LOG(ERROR) << "Error opening file for writing NCCLPG debug info: " + LOG(ERROR) << "Error opening file for writing Flight Recorder debug info: " << filename_; return; } - file.write(trace.data(), static_cast<std::streamsize>(trace.size())); - if (!file) { - LOG(ERROR) << "Error opening file for writing NCCLPG debug info: " - << filename_; + if (!file.write(trace.data(), static_cast<std::streamsize>(trace.size()))) { + const auto bad = file.bad(); + LOG(ERROR) << "Error writing Flight Recorder debug info to file: " + << filename_ << " bad bit: " << bad; return; } - LOG(INFO) << "Finished writing NCCLPG debug info to " << filename_; + + // Flush the buffer to ensure data is written to the file + file.flush(); + if (file.bad()) { + LOG(ERROR) << "Error flushing Flight Recorder debug info: " << filename_; + return; + } + + LOG(INFO) << "Finished writing Flight Recorder debug info to " << filename_; } DebugInfoWriter& DebugInfoWriter::getWriter(int rank) {