Compare commits

...

1 Commits

Author SHA1 Message Date
2505ba2717 debug pg 2025-10-16 08:08:22 -07:00
4 changed files with 58 additions and 2 deletions

View File

@ -7,7 +7,23 @@ namespace c10d {
void DebugInfoWriter::write(const std::string& trace) {
std::string filename = filename_;
if (enable_dynamic_filename_) {
LOG(INFO) << "Writing Flight Recorder debug info to a dynamic file name";
filename = c10::str(getCvarString({"TORCH_FR_DUMP_TEMP_FILE"}, ""), rank_);
// Check if filename contains a "/" and create directory if needed
size_t slashPos = filename.find_last_of('/');
if (slashPos != std::string::npos) {
// Extract directory path (everything before the last '/')
std::string dirPath = filename.substr(0, slashPos);
try {
c10::filesystem::create_directories(dirPath);
LOG(INFO) << "Created directory for Flight Recorder output: " << dirPath;
} catch (const std::exception& e) {
LOG(ERROR) << "Failed to create directory " << dirPath << ": " << e.what();
}
}
} else {
LOG(INFO) << "Writing Flight Recorder debug info to a static file name";
}
// Open a file for writing. The ios::binary flag is used to write data as
// binary.

View File

@ -214,7 +214,12 @@ void FlightRecorder<EventType>::retire_id(
std::unique_lock<std::mutex> guard(mutex_);
Entry* entry = &entries_.at(*id % max_entries_);
auto idx = *id % max_entries_;
if (entries_.size() <= idx) {
return;
}
Entry* entry = &entries_.at(idx);
if (entry->id_ == *id) {
update_state(*entry);

View File

@ -14,6 +14,7 @@
#include <c10/cuda/CUDAAllocatorConfig.h>
#include <c10/cuda/CUDAGraphsC10Utils.h>
#include <c10/cuda/CUDAGuard.h>
#include <c10/util/Backtrace.h>
#include <c10/util/Exception.h>
#include <c10/util/Logging.h>
#include <c10/util/WaitCounter.h>
@ -31,6 +32,7 @@
#include <torch/csrc/distributed/c10d/TraceUtils.h>
#include <torch/csrc/distributed/c10d/Utils.hpp>
#include <torch/csrc/distributed/c10d/cuda/utils.hpp>
#include <torch/csrc/profiler/combined_traceback.h>
#include <torch/torch.h>
#include <optional>
@ -68,6 +70,16 @@ inline bool isUnsupportedFloat8(at::ScalarType t) {
);
}
// Logs every frame of one symbolized traceback at ERROR severity.
//
// `st` holds the symbolized tracebacks; `traceback_index` selects which entry
// of `st.tracebacks` to print. Each element of that entry is used as an index
// into `st.all_frames`, and the frame's filename, line number and function
// name are emitted as one log record per frame.
//
// Both indices are range-checked before use: std::vector::operator[] performs
// no bounds checking, so an out-of-range value would be undefined behavior.
// A bad index is reported in the log and skipped instead.
void print_traceback(const torch::SymbolizedTracebacks& st,
                     size_t traceback_index) {
  if (traceback_index >= st.tracebacks.size()) {
    LOG(ERROR) << "print_traceback: traceback index " << traceback_index
               << " is out of range (" << st.tracebacks.size()
               << " tracebacks available)";
    return;
  }
  const std::vector<uint64_t>& traceback = st.tracebacks[traceback_index];
  for (const uint64_t idx : traceback) {
    if (idx >= st.all_frames.size()) {
      // Keep going so the remaining (valid) frames are still printed.
      LOG(ERROR) << "print_traceback: frame index " << idx
                 << " is out of range (" << st.all_frames.size()
                 << " frames available)";
      continue;
    }
    const torch::unwind::Frame& frame = st.all_frames[idx];
    LOG(ERROR) << " File \"" << frame.filename << "\", line "
               << frame.lineno << ", in " << frame.funcname << "\n";
  }
}
#ifdef ENABLE_NCCL_PREMUL_SUM_SUPPORT
template <typename T, ncclDataType_t dataType>
ncclRedOpRAII unpackPreMulSum(
@ -2090,6 +2102,19 @@ void ProcessGroupNCCL::Watchdog::run() {
"Process group watchdog thread terminated with exception: ",
e.what());
LOG(ERROR) << exitMsg;
LOG(ERROR) << "Backtrace:";
LOG(ERROR) << c10::get_lazy_backtrace();
std::shared_ptr<torch::CapturedTraceback> tb0 =
torch::CapturedTraceback::gather(/*python=*/true, /*script=*/true, /*cpp=*/true);
std::shared_ptr<torch::CapturedTraceback> tb1 =
torch::CapturedTraceback::gather(/*python=*/true, /*script=*/true, /*cpp=*/true);
torch::SymbolizedTracebacks r = torch::symbolize({tb0.get(), tb1.get()});
print_traceback(r, 0);
print_traceback(r, 1);
std::abort();
if (C10_LIKELY(rethrowCUDAErrors_) ||
!(std::string(e.what()).find("CUDA Error"))) {
// TODO(whc) clean up the rethrow - why is it stored in a class var and

View File

@ -195,10 +195,17 @@ static std::vector<std::string> TORCH_NCCL_USE_TENSOR_REGISTER_ALLOCATOR_HOOK =
#if defined(__linux__)
struct DumpPipe {
DumpPipe(int rank) {
LOG(INFO) << "DumpPipe works! Rank: " << rank;
std::string fileStem =
getCvarString({"TORCH_NCCL_DEBUG_INFO_PIPE_FILE"}, "");
if (fileStem.empty() ||
getCvarInt({"TORCH_NCCL_TRACE_BUFFER_SIZE"}, 0) <= 0) {
if (fileStem.empty()) {
LOG(INFO) << "DumpPipe is not enabled. Empty file";
}
if (getCvarInt({"TORCH_NCCL_TRACE_BUFFER_SIZE"}, 0) <= 0) {
LOG(INFO) << "DumpPipe is not enabled. Trace buffer size is 0";
}
return;
}
TORCH_CHECK(!fileStem.empty(), "TORCH_NCCL_DEBUG_INFO_PIPE_FILE is empty");
@ -243,8 +250,11 @@ struct DumpPipe {
};
#else
// Fallback DumpPipe used when the `#if defined(__linux__)` guard is false:
// it has no state and never requests a dump.
struct DumpPipe {
  // Only writes a log line saying the dump pipe is unavailable; `rank` is
  // used solely to prefix that message. (The constructor is defined exactly
  // once; a second definition with the same signature would not compile.)
  DumpPipe(int rank) {
    LOG(INFO) << rank << ": DumpPipe is only supported on Linux.";
  }
  // Always returns false.
  // NOTE(review): this logs at INFO on every call; if callers poll
  // shouldDump() periodically this could flood the log — confirm against the
  // call site.
  bool shouldDump() {
    LOG(INFO) << "Cannot dump. DumpPipe is only supported on Linux";
    return false;
  }
};