[Distributed] Do not expose nlohmann/json.hpp in public headers (#131925)

Move `<hlohmann/json.hpp>` dependency as well as `NCCLTraceBuffer::getCollectiveTraceJson` and `NCCLTraceBuffer::dump_json` implementation introduced by https://github.com/pytorch/pytorch/pull/129505 from the header into .cpp file. This relaxes the requirement on all downstream client to depend on the library

Fixes https://github.com/pytorch/pytorch/issues/130678

Pull Request resolved: https://github.com/pytorch/pytorch/pull/131925
Approved by: https://github.com/albanD, https://github.com/d4l3k, https://github.com/fduwjj, https://github.com/c-p-i-o
ghstack dependencies: #131922
This commit is contained in:
Nikita Shulga
2024-07-26 21:58:34 +00:00
committed by PyTorch MergeBot
parent 75c8d59ea1
commit f901b02066
2 changed files with 95 additions and 92 deletions

View File

@ -5,7 +5,6 @@
#include <stdio.h>
#include <stdlib.h>
#include <nlohmann/json.hpp>
#include <memory>
#include <mutex>
#include <thread>
@ -174,7 +173,6 @@
} while (0)
namespace c10d {
using json = nlohmann::json;
#define DEFINE_CONSTANT(name, value) \
static c10::IValue name = value; \
static std::string name##_str = value;
@ -820,73 +818,6 @@ struct NCCLTraceBuffer {
}
}
std::list<json> getCollectiveTraceJson(bool onlyActive) {
std::list<json> entries;
for (auto& e : dump_entries()) {
if (onlyActive && e.time_discovered_completed_.has_value()) {
continue;
}
json j;
j[record_id_key_str] = int64_t(e.id_);
j[pg_id_key_str] = int64_t(e.pg_id_);
j[pg_name_key_str] = e.pg_name_;
j[collective_seq_id_key_str] = int64_t(e.collective_seq_id_);
j[p2p_seq_id_key_str] = int64_t(e.p2p_seq_id_);
j[op_id_key_str] = int64_t(e.op_id_);
j[profiling_name_key_str] = e.profiling_name_;
j[time_created_key_str] = int64_t(e.time_created_);
if (e.duration_) {
j[duration_key_str] = *e.duration_;
}
auto it = e.sizes_.begin();
auto read_sizes = [&](const c10::SmallVector<int, 4>& dims) {
auto sizes = std::list<std::list<int>>();
for (auto dim : dims) {
auto arg_sizes = std::list<int>();
for (C10_UNUSED auto i : c10::irange(dim)) {
arg_sizes.push_back(*it++);
}
sizes.push_back(arg_sizes);
}
return sizes;
};
j[input_sizes_key_str] = read_sizes(e.input_dims_);
std::vector<std::string> input_dtypes_strs;
input_dtypes_strs.reserve(e.input_dtypes_.size());
for (const auto& input_dtype : e.input_dtypes_) {
input_dtypes_strs.push_back(c10::toString(input_dtype));
}
j[input_dtypes_key_str] = input_dtypes_strs;
j[output_sizes_key_str] = read_sizes(e.output_dims_);
std::vector<std::string> output_dtypes_strs;
output_dtypes_strs.reserve(e.output_dtypes_.size());
for (const auto& output_dtype : e.output_dtypes_) {
output_dtypes_strs.push_back(c10::toString(output_dtype));
}
j[output_dtypes_key_str] = output_dtypes_strs;
if (e.time_discovered_completed_.has_value()) {
j[state_key_str] = completed_state_str;
} else if (e.time_discovered_started_.has_value()) {
j[state_key_str] = started_state_str;
} else {
j[state_key_str] = scheduled_state_str;
}
j[time_discovered_started_key_str] =
e.time_discovered_started_.has_value()
? int64_t(*e.time_discovered_started_)
: 0;
j[time_discovered_completed_key_str] =
e.time_discovered_completed_.has_value()
? int64_t(*e.time_discovered_completed_)
: 0;
j[retired_key_str] = e.retired_;
j[timeout_key_str] = e.timeout_ms_;
j[is_p2p_key_str] = e.isP2P_;
entries.emplace_back(j);
}
return entries;
}
const c10::List<c10::IValue> getCollectiveTrace(
bool includeStacktraces,
bool onlyActive) {
@ -941,8 +872,7 @@ struct NCCLTraceBuffer {
auto sizes = new_list();
for (auto dim : dims) {
auto arg_sizes = new_list();
for (auto i : c10::irange(dim)) {
(void)i;
for (C10_UNUSED auto i : c10::irange(dim)) {
arg_sizes.push_back(*it++);
}
sizes.push_back(arg_sizes);
@ -1051,27 +981,7 @@ struct NCCLTraceBuffer {
std::string,
std::unordered_map<std::string, std::string>>>& ncclDumpMap,
bool includeCollectives,
bool onlyActive) {
json result;
result[version_key_str] = version_val_str;
result[pg_config_key_str] = getPgConfigJson();
result[pg_status_key_str] = getPgStatusJson();
// collective trace
if (includeCollectives) {
auto entries = getCollectiveTraceJson(onlyActive);
if (entries.size() > 0) {
result[entries_key_str] = entries;
}
}
if (ncclDumpMap.has_value()) {
result[nccl_comm_key_str] = ncclDumpMap.value();
}
return result.dump();
}
bool onlyActive);
// dump all collectives + ncclDumpMap
std::string dump(
const std::optional<std::unordered_map<