Mirror of https://github.com/pytorch/pytorch.git (synced 2025-10-21 13:44:15 +08:00)
[Distributed] Do not expose nlohmann/json.hpp in public headers (#131925)

Move the `<nlohmann/json.hpp>` dependency, as well as the `NCCLTraceBuffer::getCollectiveTraceJson` and `NCCLTraceBuffer::dump_json` implementations introduced by https://github.com/pytorch/pytorch/pull/129505, from the header into the .cpp file. This relaxes the requirement that every downstream client depend on the library.

Fixes https://github.com/pytorch/pytorch/issues/130678

Pull Request resolved: https://github.com/pytorch/pytorch/pull/131925
Approved by: https://github.com/albanD, https://github.com/d4l3k, https://github.com/fduwjj, https://github.com/c-p-i-o
ghstack dependencies: #131922
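The change follows the usual pattern for keeping a third-party dependency out of a public header: the header only declares the serialization method, and the single .cpp translation unit that defines it is the one that includes <nlohmann/json.hpp>. Below is a minimal sketch of that pattern; the file names trace_buffer.h / trace_buffer.cpp, the TraceBuffer type, and the field values are illustrative placeholders, not the actual PyTorch sources.

// trace_buffer.h -- public header: declaration only, no nlohmann include.
#pragma once
#include <string>

struct TraceBuffer {
  // Serializes the recorded entries to a JSON string. The JSON library stays
  // an implementation detail and never leaks into this header.
  std::string dump_json(bool onlyActive);
};

// trace_buffer.cpp -- the only translation unit that needs nlohmann/json.
#include "trace_buffer.h"

#include <nlohmann/json.hpp>

std::string TraceBuffer::dump_json(bool onlyActive) {
  nlohmann::json result;
  result["version"] = "1.0";           // placeholder value for the sketch
  result["only_active"] = onlyActive;  // json accepts bools directly
  return result.dump();                // callers only ever see a std::string
}

With this split, code that merely includes the header no longer needs nlohmann/json on its include path; only the library's own build does, which is the relaxation for downstream clients described above.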
Committed by: PyTorch MergeBot
Parent: 75c8d59ea1
Commit: f901b02066
@@ -12,6 +12,8 @@
 #include <cuda_runtime.h>
 #include <mutex>
 
+#include <nlohmann/json.hpp>
+
 namespace {
 constexpr int64_t kCommInitBusyWaitMillis = 10;
 } // namespace
@@ -372,6 +374,97 @@ void DebugInfoWriter::registerWriter(std::unique_ptr<DebugInfoWriter> writer) {
   writer_ = std::move(writer);
 }
 
+std::string NCCLTraceBuffer::dump_json(
+    const std::optional<std::unordered_map<
+        std::string,
+        std::unordered_map<std::string, std::string>>>& ncclDumpMap,
+    bool includeCollectives,
+    bool onlyActive) {
+  using json = nlohmann::json;
+  json result;
+  result[version_key_str] = version_val_str;
+  result[pg_config_key_str] = getPgConfigJson();
+  result[pg_status_key_str] = getPgStatusJson();
+
+  // collective trace
+  if (includeCollectives) {
+    std::list<json> entries;
+    for (auto& e : dump_entries()) {
+      json j;
+      if (onlyActive && e.time_discovered_completed_.has_value()) {
+        continue;
+      }
+      j[record_id_key_str] = int64_t(e.id_);
+      j[pg_id_key_str] = int64_t(e.pg_id_);
+      j[pg_name_key_str] = e.pg_name_;
+      j[collective_seq_id_key_str] = int64_t(e.collective_seq_id_);
+      j[p2p_seq_id_key_str] = int64_t(e.p2p_seq_id_);
+      j[op_id_key_str] = int64_t(e.op_id_);
+      j[profiling_name_key_str] = e.profiling_name_;
+      j[time_created_key_str] = int64_t(e.time_created_);
+      if (e.duration_) {
+        j[duration_key_str] = *e.duration_;
+      }
+      auto it = e.sizes_.begin();
+      auto read_sizes = [&](const c10::SmallVector<int, 4>& dims) {
+        auto sizes = std::list<std::list<int>>();
+        for (auto dim : dims) {
+          auto arg_sizes = std::list<int>();
+          for (auto i : c10::irange(dim)) {
+            (void)i;
+            arg_sizes.push_back(*it++);
+          }
+          sizes.push_back(arg_sizes);
+        }
+        return sizes;
+      };
+      j[input_sizes_key_str] = read_sizes(e.input_dims_);
+      std::vector<std::string> input_dtypes_strs;
+      input_dtypes_strs.reserve(e.input_dtypes_.size());
+      for (const auto& input_dtype : e.input_dtypes_) {
+        input_dtypes_strs.push_back(c10::toString(input_dtype));
+      }
+      j[input_dtypes_key_str] = input_dtypes_strs;
+      j[output_sizes_key_str] = read_sizes(e.output_dims_);
+      std::vector<std::string> output_dtypes_strs;
+      output_dtypes_strs.reserve(e.output_dtypes_.size());
+      for (const auto& output_dtype : e.output_dtypes_) {
+        output_dtypes_strs.push_back(c10::toString(output_dtype));
+      }
+      j[output_dtypes_key_str] = output_dtypes_strs;
+      if (e.time_discovered_completed_.has_value()) {
+        j[state_key_str] = completed_state_str;
+      } else if (e.time_discovered_started_.has_value()) {
+        j[state_key_str] = started_state_str;
+      } else {
+        j[state_key_str] = scheduled_state_str;
+      }
+      j[time_discovered_started_key_str] =
+          e.time_discovered_started_.has_value()
+          ? int64_t(*e.time_discovered_started_)
+          : 0;
+      j[time_discovered_completed_key_str] =
+          e.time_discovered_completed_.has_value()
+          ? int64_t(*e.time_discovered_completed_)
+          : 0;
+      j[retired_key_str] = e.retired_;
+      j[timeout_key_str] = e.timeout_ms_;
+      j[is_p2p_key_str] = e.isP2P_;
+      entries.emplace_back(j);
+    }
+
+    if (entries.size() > 0) {
+      result[entries_key_str] = entries;
+    }
+  }
+
+  if (ncclDumpMap.has_value()) {
+    result[nccl_comm_key_str] = ncclDumpMap.value();
+  }
+
+  return result.dump();
+}
+
 std::unique_ptr<DebugInfoWriter> DebugInfoWriter::writer_ = nullptr;
 std::atomic<bool> DebugInfoWriter::hasWriterRegistered_(false);
 
@@ -5,7 +5,6 @@
 #include <stdio.h>
 #include <stdlib.h>
 
-#include <nlohmann/json.hpp>
 #include <memory>
 #include <mutex>
 #include <thread>
@@ -174,7 +173,6 @@
 } while (0)
 
 namespace c10d {
-using json = nlohmann::json;
 #define DEFINE_CONSTANT(name, value) \
   static c10::IValue name = value;   \
   static std::string name##_str = value;
@@ -820,73 +818,6 @@ struct NCCLTraceBuffer {
     }
   }
 
-  std::list<json> getCollectiveTraceJson(bool onlyActive) {
-    std::list<json> entries;
-    for (auto& e : dump_entries()) {
-      if (onlyActive && e.time_discovered_completed_.has_value()) {
-        continue;
-      }
-      json j;
-      j[record_id_key_str] = int64_t(e.id_);
-      j[pg_id_key_str] = int64_t(e.pg_id_);
-      j[pg_name_key_str] = e.pg_name_;
-      j[collective_seq_id_key_str] = int64_t(e.collective_seq_id_);
-      j[p2p_seq_id_key_str] = int64_t(e.p2p_seq_id_);
-      j[op_id_key_str] = int64_t(e.op_id_);
-      j[profiling_name_key_str] = e.profiling_name_;
-      j[time_created_key_str] = int64_t(e.time_created_);
-      if (e.duration_) {
-        j[duration_key_str] = *e.duration_;
-      }
-      auto it = e.sizes_.begin();
-      auto read_sizes = [&](const c10::SmallVector<int, 4>& dims) {
-        auto sizes = std::list<std::list<int>>();
-        for (auto dim : dims) {
-          auto arg_sizes = std::list<int>();
-          for (C10_UNUSED auto i : c10::irange(dim)) {
-            arg_sizes.push_back(*it++);
-          }
-          sizes.push_back(arg_sizes);
-        }
-        return sizes;
-      };
-      j[input_sizes_key_str] = read_sizes(e.input_dims_);
-      std::vector<std::string> input_dtypes_strs;
-      input_dtypes_strs.reserve(e.input_dtypes_.size());
-      for (const auto& input_dtype : e.input_dtypes_) {
-        input_dtypes_strs.push_back(c10::toString(input_dtype));
-      }
-      j[input_dtypes_key_str] = input_dtypes_strs;
-      j[output_sizes_key_str] = read_sizes(e.output_dims_);
-      std::vector<std::string> output_dtypes_strs;
-      output_dtypes_strs.reserve(e.output_dtypes_.size());
-      for (const auto& output_dtype : e.output_dtypes_) {
-        output_dtypes_strs.push_back(c10::toString(output_dtype));
-      }
-      j[output_dtypes_key_str] = output_dtypes_strs;
-      if (e.time_discovered_completed_.has_value()) {
-        j[state_key_str] = completed_state_str;
-      } else if (e.time_discovered_started_.has_value()) {
-        j[state_key_str] = started_state_str;
-      } else {
-        j[state_key_str] = scheduled_state_str;
-      }
-      j[time_discovered_started_key_str] =
-          e.time_discovered_started_.has_value()
-          ? int64_t(*e.time_discovered_started_)
-          : 0;
-      j[time_discovered_completed_key_str] =
-          e.time_discovered_completed_.has_value()
-          ? int64_t(*e.time_discovered_completed_)
-          : 0;
-      j[retired_key_str] = e.retired_;
-      j[timeout_key_str] = e.timeout_ms_;
-      j[is_p2p_key_str] = e.isP2P_;
-      entries.emplace_back(j);
-    }
-    return entries;
-  }
-
   const c10::List<c10::IValue> getCollectiveTrace(
       bool includeStacktraces,
       bool onlyActive) {
@@ -941,8 +872,7 @@ struct NCCLTraceBuffer {
       auto sizes = new_list();
       for (auto dim : dims) {
         auto arg_sizes = new_list();
-        for (auto i : c10::irange(dim)) {
-          (void)i;
+        for (C10_UNUSED auto i : c10::irange(dim)) {
           arg_sizes.push_back(*it++);
         }
         sizes.push_back(arg_sizes);
@@ -1051,27 +981,7 @@ struct NCCLTraceBuffer {
           std::string,
           std::unordered_map<std::string, std::string>>>& ncclDumpMap,
       bool includeCollectives,
-      bool onlyActive) {
-    json result;
-    result[version_key_str] = version_val_str;
-    result[pg_config_key_str] = getPgConfigJson();
-    result[pg_status_key_str] = getPgStatusJson();
-
-    // collective trace
-    if (includeCollectives) {
-      auto entries = getCollectiveTraceJson(onlyActive);
-      if (entries.size() > 0) {
-        result[entries_key_str] = entries;
-      }
-    }
-
-    if (ncclDumpMap.has_value()) {
-      result[nccl_comm_key_str] = ncclDumpMap.value();
-    }
-
-    return result.dump();
-  }
-
+      bool onlyActive);
   // dump all collectives + ncclDumpMap
   std::string dump(
       const std::optional<std::unordered_map<