mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-20 12:54:11 +08:00
[c10d] Flush file in file recorder (#145458)
Summary: Flush the file after writing, in the hope of preventing the file corruption reported in https://github.com/pytorch/pytorch/pull/145125.
Test Plan: I could not reproduce the file corruption in my own tests.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/145458
Approved by: https://github.com/kwen2501
This commit is contained in:
committed by
PyTorch MergeBot
parent
5534c270db
commit
3ce68dc61e
@@ -159,18 +159,26 @@ void DebugInfoWriter::write(const std::string& trace) {
|
||||
|
||||
// Check if the file was opened successfully.
|
||||
if (!file.is_open()) {
|
||||
LOG(ERROR) << "Error opening file for writing NCCLPG debug info: "
|
||||
LOG(ERROR) << "Error opening file for writing Flight Recorder debug info: "
|
||||
<< filename_;
|
||||
return;
|
||||
}
|
||||
|
||||
file.write(trace.data(), static_cast<std::streamsize>(trace.size()));
|
||||
if (!file) {
|
||||
LOG(ERROR) << "Error opening file for writing NCCLPG debug info: "
|
||||
<< filename_;
|
||||
if (!file.write(trace.data(), static_cast<std::streamsize>(trace.size()))) {
|
||||
const auto bad = file.bad();
|
||||
LOG(ERROR) << "Error writing Flight Recorder debug info to file: "
|
||||
<< filename_ << " bad bit: " << bad;
|
||||
return;
|
||||
}
|
||||
LOG(INFO) << "Finished writing NCCLPG debug info to " << filename_;
|
||||
|
||||
// Flush the buffer to ensure data is written to the file
|
||||
file.flush();
|
||||
if (file.bad()) {
|
||||
LOG(ERROR) << "Error flushing Flight Recorder debug info: " << filename_;
|
||||
return;
|
||||
}
|
||||
|
||||
LOG(INFO) << "Finished writing Flight Recorder debug info to " << filename_;
|
||||
}
|
||||
|
||||
DebugInfoWriter& DebugInfoWriter::getWriter(int rank) {
|
||||
|
Reference in New Issue
Block a user