mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-20 12:54:11 +08:00
[c10d] Flush file in file recorder (#145458)
Summary: Flush the file after writing to help prevent the file corruption reported in https://github.com/pytorch/pytorch/pull/145125.
Test Plan: Could not reproduce the file corruption in my tests.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/145458
Approved by: https://github.com/kwen2501
This commit is contained in:
committed by
PyTorch MergeBot
parent
5534c270db
commit
3ce68dc61e
@ -159,18 +159,26 @@ void DebugInfoWriter::write(const std::string& trace) {
|
|||||||
|
|
||||||
// Check if the file was opened successfully.
|
// Check if the file was opened successfully.
|
||||||
if (!file.is_open()) {
|
if (!file.is_open()) {
|
||||||
LOG(ERROR) << "Error opening file for writing NCCLPG debug info: "
|
LOG(ERROR) << "Error opening file for writing Flight Recorder debug info: "
|
||||||
<< filename_;
|
<< filename_;
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
file.write(trace.data(), static_cast<std::streamsize>(trace.size()));
|
if (!file.write(trace.data(), static_cast<std::streamsize>(trace.size()))) {
|
||||||
if (!file) {
|
const auto bad = file.bad();
|
||||||
LOG(ERROR) << "Error opening file for writing NCCLPG debug info: "
|
LOG(ERROR) << "Error writing Flight Recorder debug info to file: "
|
||||||
<< filename_;
|
<< filename_ << " bad bit: " << bad;
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
LOG(INFO) << "Finished writing NCCLPG debug info to " << filename_;
|
|
||||||
|
// Flush the buffer to ensure data is written to the file
|
||||||
|
file.flush();
|
||||||
|
if (file.bad()) {
|
||||||
|
LOG(ERROR) << "Error flushing Flight Recorder debug info: " << filename_;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
LOG(INFO) << "Finished writing Flight Recorder debug info to " << filename_;
|
||||||
}
|
}
|
||||||
|
|
||||||
DebugInfoWriter& DebugInfoWriter::getWriter(int rank) {
|
DebugInfoWriter& DebugInfoWriter::getWriter(int rank) {
|
||||||
|
Reference in New Issue
Block a user