[c10d] Flush file in file recorder (#145458)

Summary:
Flush the file after writing to help prevent the file corruption reported in
https://github.com/pytorch/pytorch/pull/145125

Test Plan:
I could not reproduce the file corruption in my own tests, so the fix is unverified against the original failure.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/145458
Approved by: https://github.com/kwen2501
This commit is contained in:
Chirag Pandya
2025-01-27 19:57:44 +00:00
committed by PyTorch MergeBot
parent 5534c270db
commit 3ce68dc61e

View File

@@ -159,18 +159,26 @@ void DebugInfoWriter::write(const std::string& trace) {
// Check if the file was opened successfully. // Check if the file was opened successfully.
if (!file.is_open()) { if (!file.is_open()) {
LOG(ERROR) << "Error opening file for writing NCCLPG debug info: " LOG(ERROR) << "Error opening file for writing Flight Recorder debug info: "
<< filename_; << filename_;
return; return;
} }
file.write(trace.data(), static_cast<std::streamsize>(trace.size())); if (!file.write(trace.data(), static_cast<std::streamsize>(trace.size()))) {
if (!file) { const auto bad = file.bad();
LOG(ERROR) << "Error opening file for writing NCCLPG debug info: " LOG(ERROR) << "Error writing Flight Recorder debug info to file: "
<< filename_; << filename_ << " bad bit: " << bad;
return; return;
} }
LOG(INFO) << "Finished writing NCCLPG debug info to " << filename_;
// Flush the buffer to ensure data is written to the file
file.flush();
if (file.bad()) {
LOG(ERROR) << "Error flushing Flight Recorder debug info: " << filename_;
return;
}
LOG(INFO) << "Finished writing Flight Recorder debug info to " << filename_;
} }
DebugInfoWriter& DebugInfoWriter::getWriter(int rank) { DebugInfoWriter& DebugInfoWriter::getWriter(int rank) {