mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-20 21:14:14 +08:00
[Distributed] Do not expose nlohmann/json.hpp
in public headers (#131925)
Move the `<nlohmann/json.hpp>` dependency, as well as the `NCCLTraceBuffer::getCollectiveTraceJson` and `NCCLTraceBuffer::dump_json` implementations introduced by https://github.com/pytorch/pytorch/pull/129505, from the header into the .cpp file. This relaxes the requirement on all downstream clients to depend on the library. Fixes https://github.com/pytorch/pytorch/issues/130678. Pull Request resolved: https://github.com/pytorch/pytorch/pull/131925. Approved by: https://github.com/albanD, https://github.com/d4l3k, https://github.com/fduwjj, https://github.com/c-p-i-o. ghstack dependencies: #131922
This commit is contained in:
committed by
PyTorch MergeBot
parent
75c8d59ea1
commit
f901b02066
@ -5,7 +5,6 @@
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#include <nlohmann/json.hpp>
|
||||
#include <memory>
|
||||
#include <mutex>
|
||||
#include <thread>
|
||||
@ -174,7 +173,6 @@
|
||||
} while (0)
|
||||
|
||||
namespace c10d {
|
||||
using json = nlohmann::json;
|
||||
#define DEFINE_CONSTANT(name, value) \
|
||||
static c10::IValue name = value; \
|
||||
static std::string name##_str = value;
|
||||
@ -820,73 +818,6 @@ struct NCCLTraceBuffer {
|
||||
}
|
||||
}
|
||||
|
||||
std::list<json> getCollectiveTraceJson(bool onlyActive) {
|
||||
std::list<json> entries;
|
||||
for (auto& e : dump_entries()) {
|
||||
if (onlyActive && e.time_discovered_completed_.has_value()) {
|
||||
continue;
|
||||
}
|
||||
json j;
|
||||
j[record_id_key_str] = int64_t(e.id_);
|
||||
j[pg_id_key_str] = int64_t(e.pg_id_);
|
||||
j[pg_name_key_str] = e.pg_name_;
|
||||
j[collective_seq_id_key_str] = int64_t(e.collective_seq_id_);
|
||||
j[p2p_seq_id_key_str] = int64_t(e.p2p_seq_id_);
|
||||
j[op_id_key_str] = int64_t(e.op_id_);
|
||||
j[profiling_name_key_str] = e.profiling_name_;
|
||||
j[time_created_key_str] = int64_t(e.time_created_);
|
||||
if (e.duration_) {
|
||||
j[duration_key_str] = *e.duration_;
|
||||
}
|
||||
auto it = e.sizes_.begin();
|
||||
auto read_sizes = [&](const c10::SmallVector<int, 4>& dims) {
|
||||
auto sizes = std::list<std::list<int>>();
|
||||
for (auto dim : dims) {
|
||||
auto arg_sizes = std::list<int>();
|
||||
for (C10_UNUSED auto i : c10::irange(dim)) {
|
||||
arg_sizes.push_back(*it++);
|
||||
}
|
||||
sizes.push_back(arg_sizes);
|
||||
}
|
||||
return sizes;
|
||||
};
|
||||
j[input_sizes_key_str] = read_sizes(e.input_dims_);
|
||||
std::vector<std::string> input_dtypes_strs;
|
||||
input_dtypes_strs.reserve(e.input_dtypes_.size());
|
||||
for (const auto& input_dtype : e.input_dtypes_) {
|
||||
input_dtypes_strs.push_back(c10::toString(input_dtype));
|
||||
}
|
||||
j[input_dtypes_key_str] = input_dtypes_strs;
|
||||
j[output_sizes_key_str] = read_sizes(e.output_dims_);
|
||||
std::vector<std::string> output_dtypes_strs;
|
||||
output_dtypes_strs.reserve(e.output_dtypes_.size());
|
||||
for (const auto& output_dtype : e.output_dtypes_) {
|
||||
output_dtypes_strs.push_back(c10::toString(output_dtype));
|
||||
}
|
||||
j[output_dtypes_key_str] = output_dtypes_strs;
|
||||
if (e.time_discovered_completed_.has_value()) {
|
||||
j[state_key_str] = completed_state_str;
|
||||
} else if (e.time_discovered_started_.has_value()) {
|
||||
j[state_key_str] = started_state_str;
|
||||
} else {
|
||||
j[state_key_str] = scheduled_state_str;
|
||||
}
|
||||
j[time_discovered_started_key_str] =
|
||||
e.time_discovered_started_.has_value()
|
||||
? int64_t(*e.time_discovered_started_)
|
||||
: 0;
|
||||
j[time_discovered_completed_key_str] =
|
||||
e.time_discovered_completed_.has_value()
|
||||
? int64_t(*e.time_discovered_completed_)
|
||||
: 0;
|
||||
j[retired_key_str] = e.retired_;
|
||||
j[timeout_key_str] = e.timeout_ms_;
|
||||
j[is_p2p_key_str] = e.isP2P_;
|
||||
entries.emplace_back(j);
|
||||
}
|
||||
return entries;
|
||||
}
|
||||
|
||||
const c10::List<c10::IValue> getCollectiveTrace(
|
||||
bool includeStacktraces,
|
||||
bool onlyActive) {
|
||||
@ -941,8 +872,7 @@ struct NCCLTraceBuffer {
|
||||
auto sizes = new_list();
|
||||
for (auto dim : dims) {
|
||||
auto arg_sizes = new_list();
|
||||
for (auto i : c10::irange(dim)) {
|
||||
(void)i;
|
||||
for (C10_UNUSED auto i : c10::irange(dim)) {
|
||||
arg_sizes.push_back(*it++);
|
||||
}
|
||||
sizes.push_back(arg_sizes);
|
||||
@ -1051,27 +981,7 @@ struct NCCLTraceBuffer {
|
||||
std::string,
|
||||
std::unordered_map<std::string, std::string>>>& ncclDumpMap,
|
||||
bool includeCollectives,
|
||||
bool onlyActive) {
|
||||
json result;
|
||||
result[version_key_str] = version_val_str;
|
||||
result[pg_config_key_str] = getPgConfigJson();
|
||||
result[pg_status_key_str] = getPgStatusJson();
|
||||
|
||||
// collective trace
|
||||
if (includeCollectives) {
|
||||
auto entries = getCollectiveTraceJson(onlyActive);
|
||||
if (entries.size() > 0) {
|
||||
result[entries_key_str] = entries;
|
||||
}
|
||||
}
|
||||
|
||||
if (ncclDumpMap.has_value()) {
|
||||
result[nccl_comm_key_str] = ncclDumpMap.value();
|
||||
}
|
||||
|
||||
return result.dump();
|
||||
}
|
||||
|
||||
bool onlyActive);
|
||||
// dump all collectives + ncclDumpMap
|
||||
std::string dump(
|
||||
const std::optional<std::unordered_map<
|
||||
|
Reference in New Issue
Block a user