[c10d] Flush file in file recorder (#145458)

Summary:
Flush the file after writing to help prevent the file corruption reported in
https://github.com/pytorch/pytorch/pull/145125

Test Plan:
I was unable to reproduce the file corruption in my tests.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/145458
Approved by: https://github.com/kwen2501
This commit is contained in:
Chirag Pandya
2025-01-27 19:57:44 +00:00
committed by PyTorch MergeBot
parent 5534c270db
commit 3ce68dc61e

View File

@@ -159,18 +159,26 @@ void DebugInfoWriter::write(const std::string& trace) {
// Check if the file was opened successfully.
if (!file.is_open()) {
LOG(ERROR) << "Error opening file for writing NCCLPG debug info: "
LOG(ERROR) << "Error opening file for writing Flight Recorder debug info: "
<< filename_;
return;
}
file.write(trace.data(), static_cast<std::streamsize>(trace.size()));
if (!file) {
LOG(ERROR) << "Error opening file for writing NCCLPG debug info: "
<< filename_;
if (!file.write(trace.data(), static_cast<std::streamsize>(trace.size()))) {
const auto bad = file.bad();
LOG(ERROR) << "Error writing Flight Recorder debug info to file: "
<< filename_ << " bad bit: " << bad;
return;
}
LOG(INFO) << "Finished writing NCCLPG debug info to " << filename_;
// Flush the buffer to ensure data is written to the file
file.flush();
if (file.bad()) {
LOG(ERROR) << "Error flushing Flight Recorder debug info: " << filename_;
return;
}
LOG(INFO) << "Finished writing Flight Recorder debug info to " << filename_;
}
DebugInfoWriter& DebugInfoWriter::getWriter(int rank) {