Back out "Revert D16469619: Add Virtual Memory and CPU percentage computation to AIBench"
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/23821

Reviewed By: hl475

Differential Revision: D16654854

fbshipit-source-id: f057023e890cbcbd9145ef2ecb449df2fbba592b
Committed by: Facebook GitHub Bot
Parent: e90adf59a0
Commit: e23e4cc356
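Before the diff, a minimal sketch of the heap-measurement pattern this change introduces: sample allocated bytes before and after a phase and report the delta in kB. This is not code from the commit; it assumes a glibc target where mallinfo() is available (the Apple branch in the diff uses malloc_zone_statistics() instead), and the helper name currentAllocatedBytes is hypothetical.

// Sketch only: measure the increase in allocated heap memory around a phase.
// currentAllocatedBytes() mirrors the glibc branch of
// getVirtualMemoryIfOptionEnabled() in the diff below.
#include <malloc.h>

static long currentAllocatedBytes() {
  struct mallinfo info = mallinfo(); // glibc heap statistics
  return info.uordblks;              // bytes currently allocated and in use
}

// Usage:
//   long before = currentAllocatedBytes();
//   /* load and run the net */
//   long delta_kb = (currentAllocatedBytes() - before) / 1024; // reported as "memory" in kB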
@@ -50,6 +50,11 @@ C10_DEFINE_string(
"Input type when specifying the input dimension."
"The supported types are float, uint8_t.");
C10_DEFINE_int(iter, 10, "The number of iterations to run.");
C10_DEFINE_bool(
measure_memory,
false,
"Whether to measure increase in allocated memory while "
"loading and running the net.");
C10_DEFINE_string(net, "", "The given net to benchmark.");
C10_DEFINE_string(
output,
@@ -35,6 +35,14 @@
#include <observers/observer_config.h>
#include <observers/perf_observer.h>

#if defined(TARGET_OS_MAC) || \
defined(TARGET_OS_IPHONE) || \
defined(TARGET_IPHONE_SIMULATOR)
#include <malloc/malloc.h>
#else
#include <malloc.h>
#endif

using std::map;
using std::shared_ptr;
using std::string;
@@ -235,7 +243,7 @@ void fillInputBlob(

void runNetwork(
shared_ptr<caffe2::Workspace> workspace,
caffe2::NetDef& net_def,
caffe2::NetBase* net,
map<string, caffe2::TensorProtos>& tensor_protos_map,
const bool wipe_cache,
const bool run_individual,
@@ -250,13 +258,6 @@ void runNetwork(
const std::string& output,
const std::string& output_folder) {

if (!net_def.has_name()) {
net_def.set_name("benchmark");
}

caffe2::NetBase* net = workspace->CreateNet(net_def);
CHECK_NOTNULL(net);

LOG(INFO) << "Starting benchmark.";
caffe2::ObserverConfig::initSampleRate(1, 1, 1, run_individual, warmup);
LOG(INFO) << "Running warmup runs.";
@@ -376,6 +377,35 @@ void writeOutput(
}
}

void logBenchmarkResult(
const std::string& type,
const std::string& metric,
const std::string& unit,
const int value) {
LOG(INFO) << caffe2::NetObserverReporterPrint::IDENTIFIER << "{"
<< "\"type\": \"" << type << "\", "
<< "\"metric\": \"" << metric << "\", "
<< "\"unit\": \"" << unit << "\", "
<< "\"value\": " << c10::to_string(value) << "}\n";
}

long getVirtualMemoryIfOptionEnabled(bool FLAGS_measure_memory) {
if (FLAGS_measure_memory) {
#if defined(TARGET_OS_IPHONE) || \
defined(TARGET_OS_MAC) || \
defined(TARGET_IPHONE_SIMULATOR)
malloc_statistics_t stats = {0};
malloc_zone_statistics(nullptr, &stats);
return stats.size_allocated;
#else
struct mallinfo info = mallinfo();
return info.uordblks;
#endif
}

return 0;
}

int benchmark(
int argc,
char* argv[],
@@ -386,6 +416,7 @@ int benchmark(
const string& FLAGS_input_file,
const string& FLAGS_input_type,
int FLAGS_iter,
bool FLAGS_measure_memory,
const string& FLAGS_net,
const string& FLAGS_output,
const string& FLAGS_output_folder,
@@ -423,19 +454,15 @@ int benchmark(

auto workspace = std::make_shared<caffe2::Workspace>(new caffe2::Workspace());
bool run_on_gpu = backendCudaSet(FLAGS_backend);
// Run initialization network.
// Run initialization network, measure resources used.
long init_vmem = getVirtualMemoryIfOptionEnabled(FLAGS_measure_memory);
caffe2::NetDef init_net_def;
CAFFE_ENFORCE(ReadProtoFromFile(FLAGS_init_net, &init_net_def));
setOperatorEngine(&init_net_def, FLAGS_backend);
CAFFE_ENFORCE(workspace->RunNetOnce(init_net_def));

// Run main network.
caffe2::NetDef net_def;
CAFFE_ENFORCE(ReadProtoFromFile(FLAGS_net, &net_def));
setOperatorEngine(&net_def, FLAGS_backend);
init_vmem = getVirtualMemoryIfOptionEnabled(FLAGS_measure_memory) - init_vmem;

map<string, caffe2::TensorProtos> tensor_protos_map;

int num_blobs = loadInput(
workspace,
run_on_gpu,
@@ -445,9 +472,19 @@ int benchmark(
FLAGS_input_dims,
FLAGS_input_type);

// Run main network.
long predict_vmem = getVirtualMemoryIfOptionEnabled(FLAGS_measure_memory);
caffe2::NetDef net_def;
CAFFE_ENFORCE(ReadProtoFromFile(FLAGS_net, &net_def));
setOperatorEngine(&net_def, FLAGS_backend);
if (!net_def.has_name()) {
net_def.set_name("benchmark");
}
caffe2::NetBase* net = workspace->CreateNet(net_def);
CHECK_NOTNULL(net);
runNetwork(
workspace,
net_def,
net,
tensor_protos_map,
FLAGS_wipe_cache,
FLAGS_run_individual,
@@ -461,6 +498,12 @@ int benchmark(
FLAGS_sleep_between_net_and_operator,
FLAGS_output,
FLAGS_output_folder);
predict_vmem = getVirtualMemoryIfOptionEnabled(
FLAGS_measure_memory) - predict_vmem;
if (FLAGS_measure_memory) {
logBenchmarkResult(
"NET_", "memory", "kB", (init_vmem + predict_vmem) / 1024);
}

return 0;
}
@@ -125,7 +125,7 @@ void writeOutput(
const int num_blobs);
void runNetwork(
shared_ptr<caffe2::Workspace> workspace,
caffe2::NetDef& net_def,
caffe2::NetBase* net,
map<string, caffe2::TensorProtos>& tensor_protos_map,
const bool wipe_cache,
const bool run_individual,
@@ -149,6 +149,7 @@ int benchmark(
const string& FLAGS_input_file,
const string& FLAGS_input_type,
int FLAGS_iter,
bool FLAGS_measure_memory,
const string& FLAGS_net,
const string& FLAGS_output,
const string& FLAGS_output_folder,
@@ -22,6 +22,7 @@ int main(int argc, char** argv) {
FLAGS_input_file,
FLAGS_input_type,
FLAGS_iter,
FLAGS_measure_memory,
FLAGS_net,
FLAGS_output,
FLAGS_output_folder,
@@ -19,6 +19,7 @@ struct PerformanceInformation {
std::string type = ""; // the type of the operator
// Measured
double latency = 0;
double cpuMilliseconds = 0;
};

class CAFFE2_OBSERVER_API NetObserverReporter {
@@ -1,5 +1,6 @@
#include "observers/net_observer_reporter_print.h"

#include <algorithm>
#include <sstream>
#include "caffe2/core/init.h"
#include "observers/observer_config.h"
@@ -9,6 +10,7 @@ namespace caffe2 {
const std::string NetObserverReporterPrint::IDENTIFIER = "Caffe2Observer ";
static std::string get_op_args(PerformanceInformation p);
static std::string get_tensor_shapes(PerformanceInformation p);
static std::string sanatize(std::string json_s);

void NetObserverReporterPrint::report(
NetBase* net,
@@ -23,29 +25,49 @@ void NetObserverReporterPrint::report(
{"value", c10::to_string(p.second.latency * 1000)},
{"unit", "us"},
{"metric", "latency"}});
caffe2_perf.push_back({{"type", "NET_"},
{
"value",
c10::to_string(
p.second.cpuMilliseconds /
p.second.latency *
100),
},
{"unit", "percent"},
{"metric", "cpu_percent"}});
} else if (p.first != "NET_DELAY") {
// for operator perf
std::string shape_str = get_tensor_shapes(p.second);
std::string args_str = get_op_args(p.second);

caffe2_perf.push_back({{"type", p.first},
std::string type = p.first;
caffe2_perf.push_back({{"type", type},
{"value", c10::to_string(p.second.latency * 1000)},
{"unit", "us"},
{"metric", "latency"}});
caffe2_perf.push_back({{"type", type},
{
"value",
c10::to_string(
p.second.cpuMilliseconds /
p.second.latency *
100),
},
{"unit", "percent"},
{"metric", "cpu_percent"}});
if (p.second.flops > 0) {
caffe2_perf.push_back({{"type", p.first},
caffe2_perf.push_back({{"type", type},
{"value", c10::to_string(p.second.flops)},
{"unit", "flop"},
{"metric", "flops"}});
}
if (shape_str != "") {
caffe2_perf.push_back({{"type", p.first},
caffe2_perf.push_back({{"type", type},
{"info_string", shape_str},
{"unit", ""},
{"metric", "tensor_shapes"}});
}
if (args_str != "") {
caffe2_perf.push_back({{"type", p.first},
caffe2_perf.push_back({{"type", type},
{"info_string", args_str},
{"unit", ""},
{"metric", "op_args"}});
@@ -57,13 +79,13 @@ void NetObserverReporterPrint::report(
std::stringstream buffer;
auto entry = *it;
buffer << IDENTIFIER << "{";
buffer << "\"type\": \"" << entry["type"] << "\","
<< "\"unit\": \"" << entry["unit"] << "\","
<< "\"metric\": \"" << entry["metric"] << "\",";
buffer << "\"type\": \"" << sanatize(entry["type"]) << "\","
<< "\"unit\": \"" << sanatize(entry["unit"]) << "\","
<< "\"metric\": \"" << sanatize(entry["metric"]) << "\",";
if (entry.find("value") != entry.end()) {
buffer << "\"value\": \"" << entry["value"] << "\"";
buffer << "\"value\": \"" << sanatize(entry["value"]) << "\"";
} else if (entry.find("info_string") != entry.end()) {
buffer << "\"info_string\": \"" << entry["info_string"] << "\"";
buffer << "\"info_string\": \"" << sanatize(entry["info_string"]) << "\"";
}
buffer << "}";
LOG(INFO) << buffer.str();
@@ -117,4 +139,12 @@ static std::string get_op_args(PerformanceInformation p) {
}
return args_str;
}

static std::string sanatize(std::string json_s) {
// Remove illegal characters from the name that would cause json string to
// become invalid
json_s.erase(std::remove(json_s.begin(), json_s.end(), '"'), json_s.end());
json_s.erase(std::remove(json_s.begin(), json_s.end(), '\\'), json_s.end());
return json_s;
}
}
@@ -6,6 +6,10 @@
#endif

#include <random>
#include <time.h>
#ifdef _WIN32
#include <windows.h>
#endif
#include "caffe2/core/common.h"
#include "caffe2/core/init.h"
#include "caffe2/core/operator.h"
@@ -62,6 +66,75 @@ bool registerGlobalPerfNetObserverCreator(int* /*pargc*/, char*** /*pargv*/) {
}
} // namespace

#ifdef _WIN32
double getTicksPerMillisecond() {
static LARGE_INTEGER ticks_per_sec;
if (!ticks_per_sec.QuadPart) {
QueryPerformanceFrequency(&ticks_per_sec);
if (!ticks_per_sec.QuadPart) {
return 0.0;
}
}

return static_cast<double>(ticks_per_sec.QuadPart) / 1000.0;
}
#else
double getClockTimeMilliseconds(clockid_t clk_id) {
int result;
struct timespec tp;
result = clock_gettime(clk_id, &tp);
if (result == -1) {
return 0.0;
} else {
return tp.tv_sec * 1000.0 + tp.tv_nsec / 1000000.0;
}
}
#endif

double getWallClockTimeMilliseconds() {
#ifdef _WIN32
double ticks_per_ms = getTicksPerMillisecond();
if (ticks_per_ms) {
LARGE_INTEGER ticks;
if (QueryPerformanceCounter(&ticks)) {
return static_cast<double>(ticks.QuadPart) / ticks_per_ms;
}
}

return 0.0;
#else
return getClockTimeMilliseconds(CLOCK_MONOTONIC);
#endif
}
double getCpuTimeMilliseconds() {
#ifdef _WIN32
FILETIME creation_time;
FILETIME exit_time;
FILETIME kernel_time;
FILETIME user_time;
if (GetProcessTimes(
GetCurrentProcess(),
&creation_time,
&exit_time,
&kernel_time,
&user_time)) {
ULARGE_INTEGER kernel;
ULARGE_INTEGER user;
kernel.HighPart = kernel_time.dwHighDateTime;
kernel.LowPart = kernel_time.dwLowDateTime;
user.HighPart = user_time.dwHighDateTime;
user.LowPart = user_time.dwLowDateTime;
return (static_cast<double>(kernel.QuadPart) +
static_cast<double>(user.QuadPart)) / 10000.0;
}

return 0.0;
#else
return getClockTimeMilliseconds(CLOCK_PROCESS_CPUTIME_ID);
#endif
}

REGISTER_CAFFE2_EARLY_INIT_FUNCTION(
registerGlobalPerfNetObserverCreator,
&registerGlobalPerfNetObserverCreator,
@@ -107,30 +180,31 @@ void PerfNetObserver::Start() {
}
}

if (logType_ != PerfNetObserver::NONE) {
/* Only start timer when we need to */
timer_.Start();
}
wallMilliseconds_ = getWallClockTimeMilliseconds();
cpuMilliseconds_ = getCpuTimeMilliseconds();
}

void PerfNetObserver::Stop() {
if (logType_ == PerfNetObserver::NONE) {
return;
}
auto currentRunTime = timer_.MilliSeconds();
std::map<std::string, PerformanceInformation> info;
PerformanceInformation net_perf;
net_perf.latency = currentRunTime;
net_perf.cpuMilliseconds =
getCpuTimeMilliseconds() - cpuMilliseconds_;
net_perf.latency =
getWallClockTimeMilliseconds() - wallMilliseconds_;

if (logType_ == PerfNetObserver::OPERATOR_DELAY) {
const auto& operators = subject_->GetOperators();
for (int idx = 0; idx < operators.size(); ++idx) {
const auto* op = operators[idx];
auto name = getObserverName(op, idx);
PerformanceInformation p;

p.latency = static_cast<const PerfOperatorObserver*>(observerMap_[op])
->getMilliseconds();

const PerfOperatorObserver* opObserver =
static_cast<const PerfOperatorObserver*>(observerMap_[op]);
p.latency = opObserver->getWallMilliseconds();
p.cpuMilliseconds = opObserver->getCpuMilliseconds();
p.engine = op->engine();
p.type = op->type();
p.tensor_shapes =
@@ -176,30 +250,34 @@ PerfOperatorObserver::PerfOperatorObserver(
PerfNetObserver* netObserver)
: ObserverBase<OperatorBase>(op),
netObserver_(netObserver),
milliseconds_(0) {
wallMilliseconds_(0),
cpuMilliseconds_(0) {
CAFFE_ENFORCE(netObserver_, "Observers can't operate outside of the net");
}

PerfOperatorObserver::~PerfOperatorObserver() {}

void PerfOperatorObserver::Start() {
/* Get the time from the start of the net minus the time spent
in previous invocations. It is the time spent on other operators.
This way, when the operator finishes, the time from the start of the net
minus the time spent in all other operators is the total time on this
operator. This is done to avoid saving a timer in each operator */
milliseconds_ = netObserver_->getTimer().MilliSeconds() - milliseconds_;
wallMilliseconds_ = getWallClockTimeMilliseconds();
cpuMilliseconds_ = getCpuTimeMilliseconds();
}

void PerfOperatorObserver::Stop() {
/* Time from the start of the net minus the time spent on all other
operators is the time spent on this operator */
milliseconds_ = netObserver_->getTimer().MilliSeconds() - milliseconds_;
cpuMilliseconds_ =
getCpuTimeMilliseconds() - cpuMilliseconds_;
wallMilliseconds_ =
getWallClockTimeMilliseconds() - wallMilliseconds_;
tensor_shapes_ = subject_->InputTensorShapes();
}

double PerfOperatorObserver::getMilliseconds() const {
return milliseconds_;
double PerfOperatorObserver::getWallMilliseconds() const {
return wallMilliseconds_;
}

double PerfOperatorObserver::getCpuMilliseconds() const {
return cpuMilliseconds_;
}

std::vector<TensorShape> PerfOperatorObserver::getTensorShapes() const {
@@ -10,16 +10,13 @@

namespace caffe2 {

double getClockTimeMilliseconds();

class CAFFE2_OBSERVER_API PerfNetObserver : public NetObserver {
public:
explicit PerfNetObserver(NetBase* subject_);
virtual ~PerfNetObserver();

caffe2::Timer& getTimer() {
return timer_;
}

private:
void Start() override;
void Stop() override;
@@ -37,7 +34,8 @@ class CAFFE2_OBSERVER_API PerfNetObserver : public NetObserver {
std::unordered_map<const OperatorBase*, const ObserverBase<OperatorBase>*>
observerMap_;

caffe2::Timer timer_;
double wallMilliseconds_;
double cpuMilliseconds_;
};

class PerfOperatorObserver : public ObserverBase<OperatorBase> {
@@ -45,7 +43,8 @@ class PerfOperatorObserver : public ObserverBase<OperatorBase> {
PerfOperatorObserver(OperatorBase* op, PerfNetObserver* netObserver);
virtual ~PerfOperatorObserver();

double getMilliseconds() const;
double getWallMilliseconds() const;
double getCpuMilliseconds() const;
std::vector<TensorShape> getTensorShapes() const;

private:
@@ -60,7 +59,8 @@ class PerfOperatorObserver : public ObserverBase<OperatorBase> {
// without storing inside the operator observer. Each field is memory
// costly here and a raw pointer is a cheapest sholution
PerfNetObserver* netObserver_;
double milliseconds_;
double wallMilliseconds_;
double cpuMilliseconds_;
std::vector<TensorShape> tensor_shapes_;
};
} // namespace caffe2
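As a reading aid, a hedged sketch (not part of the diff) of how the new cpu_percent metric falls out of the two timestamps the observer now records, assuming a POSIX clock_gettime target as in the non-Windows branch above: wall-clock and process-CPU time are each taken as deltas around the same region, and the reporter emits their ratio times 100.

#include <time.h>

// Sketch only: nowMs() mirrors getClockTimeMilliseconds() from the diff.
static double nowMs(clockid_t id) {
  struct timespec tp;
  if (clock_gettime(id, &tp) == -1) {
    return 0.0;
  }
  return tp.tv_sec * 1000.0 + tp.tv_nsec / 1000000.0;
}

// double wall0 = nowMs(CLOCK_MONOTONIC);
// double cpu0  = nowMs(CLOCK_PROCESS_CPUTIME_ID);
// ... run the net or operator ...
// double latency = nowMs(CLOCK_MONOTONIC) - wall0;          // ms, reported as "latency"
// double cpu_ms  = nowMs(CLOCK_PROCESS_CPUTIME_ID) - cpu0;  // ms of CPU time
// double cpu_percent = cpu_ms / latency * 100;              // e.g. 50 ms CPU over 100 ms wall = 50%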