Back out "Revert D16469619: Add Virtual Memory and CPU percentage computation to AIBench"

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/23821

Reviewed By: hl475

Differential Revision: D16654854

fbshipit-source-id: f057023e890cbcbd9145ef2ecb449df2fbba592b
This commit is contained in:
Geoffrey Goh
2019-08-07 15:32:18 -07:00
committed by Facebook Github Bot
parent e90adf59a0
commit e23e4cc356
8 changed files with 213 additions and 54 deletions

View File

@ -50,6 +50,11 @@ C10_DEFINE_string(
"Input type when specifying the input dimension."
"The supported types are float, uint8_t.");
C10_DEFINE_int(iter, 10, "The number of iterations to run.");
// Gates the virtual-memory measurement: when true, the benchmark samples the
// allocator's in-use byte count before/after loading and running the net and
// logs the delta as a "memory" metric (see getVirtualMemoryIfOptionEnabled).
C10_DEFINE_bool(
    measure_memory,
    false,
    "Whether to measure increase in allocated memory while "
    "loading and running the net.");
C10_DEFINE_string(net, "", "The given net to benchmark.");
C10_DEFINE_string(
output,

View File

@ -35,6 +35,14 @@
#include <observers/observer_config.h>
#include <observers/perf_observer.h>
#if defined(TARGET_OS_MAC) || \
defined(TARGET_OS_IPHONE) || \
defined(TARGET_IPHONE_SIMULATOR)
#include <malloc/malloc.h>
#else
#include <malloc.h>
#endif
using std::map;
using std::shared_ptr;
using std::string;
@ -235,7 +243,7 @@ void fillInputBlob(
void runNetwork(
shared_ptr<caffe2::Workspace> workspace,
caffe2::NetDef& net_def,
caffe2::NetBase* net,
map<string, caffe2::TensorProtos>& tensor_protos_map,
const bool wipe_cache,
const bool run_individual,
@ -250,13 +258,6 @@ void runNetwork(
const std::string& output,
const std::string& output_folder) {
if (!net_def.has_name()) {
net_def.set_name("benchmark");
}
caffe2::NetBase* net = workspace->CreateNet(net_def);
CHECK_NOTNULL(net);
LOG(INFO) << "Starting benchmark.";
caffe2::ObserverConfig::initSampleRate(1, 1, 1, run_individual, warmup);
LOG(INFO) << "Running warmup runs.";
@ -376,6 +377,35 @@ void writeOutput(
}
}
// Emits one benchmark metric as a single JSON-ish log line, prefixed with
// NetObserverReporterPrint::IDENTIFIER so it matches the reporter's output
// format and can be picked up by the same log parsers.
//
// @param type   metric scope tag (e.g. "NET_").
// @param metric metric name (e.g. "memory").
// @param unit   unit string (e.g. "kB").
// @param value  integral metric value.
void logBenchmarkResult(
    const std::string& type,
    const std::string& metric,
    const std::string& unit,
    const int value) {
  // NOTE(review): the trailing "\n" produces a blank line on top of
  // LOG(INFO)'s own newline — confirm downstream parsers tolerate it.
  LOG(INFO) << caffe2::NetObserverReporterPrint::IDENTIFIER << "{"
            << "\"type\": \"" << type << "\", "
            << "\"metric\": \"" << metric << "\", "
            << "\"unit\": \"" << unit << "\", "
            << "\"value\": " << c10::to_string(value) << "}\n";
}
// Returns the number of bytes currently handed out by the process allocator,
// or 0 when measurement is disabled. Callers take two samples and diff them
// to estimate memory growth across a phase (see benchmark()).
long getVirtualMemoryIfOptionEnabled(bool FLAGS_measure_memory) {
  if (!FLAGS_measure_memory) {
    return 0;
  }
#if defined(TARGET_OS_IPHONE) || \
    defined(TARGET_OS_MAC) || \
    defined(TARGET_IPHONE_SIMULATOR)
  // Apple platforms: ask the default malloc zone for its statistics.
  malloc_statistics_t stats = {0};
  malloc_zone_statistics(nullptr, &stats);
  return stats.size_allocated;
#else
  // glibc and compatible allocators: uordblks is total allocated space.
  // NOTE(review): mallinfo's fields are `int` and can wrap for heaps > 2 GiB.
  struct mallinfo info = mallinfo();
  return info.uordblks;
#endif
}
int benchmark(
int argc,
char* argv[],
@ -386,6 +416,7 @@ int benchmark(
const string& FLAGS_input_file,
const string& FLAGS_input_type,
int FLAGS_iter,
bool FLAGS_measure_memory,
const string& FLAGS_net,
const string& FLAGS_output,
const string& FLAGS_output_folder,
@ -423,19 +454,15 @@ int benchmark(
auto workspace = std::make_shared<caffe2::Workspace>(new caffe2::Workspace());
bool run_on_gpu = backendCudaSet(FLAGS_backend);
// Run initialization network.
// Run initialization network, measure resources used.
long init_vmem = getVirtualMemoryIfOptionEnabled(FLAGS_measure_memory);
caffe2::NetDef init_net_def;
CAFFE_ENFORCE(ReadProtoFromFile(FLAGS_init_net, &init_net_def));
setOperatorEngine(&init_net_def, FLAGS_backend);
CAFFE_ENFORCE(workspace->RunNetOnce(init_net_def));
// Run main network.
caffe2::NetDef net_def;
CAFFE_ENFORCE(ReadProtoFromFile(FLAGS_net, &net_def));
setOperatorEngine(&net_def, FLAGS_backend);
init_vmem = getVirtualMemoryIfOptionEnabled(FLAGS_measure_memory) - init_vmem;
map<string, caffe2::TensorProtos> tensor_protos_map;
int num_blobs = loadInput(
workspace,
run_on_gpu,
@ -445,9 +472,19 @@ int benchmark(
FLAGS_input_dims,
FLAGS_input_type);
// Run main network.
long predict_vmem = getVirtualMemoryIfOptionEnabled(FLAGS_measure_memory);
caffe2::NetDef net_def;
CAFFE_ENFORCE(ReadProtoFromFile(FLAGS_net, &net_def));
setOperatorEngine(&net_def, FLAGS_backend);
if (!net_def.has_name()) {
net_def.set_name("benchmark");
}
caffe2::NetBase* net = workspace->CreateNet(net_def);
CHECK_NOTNULL(net);
runNetwork(
workspace,
net_def,
net,
tensor_protos_map,
FLAGS_wipe_cache,
FLAGS_run_individual,
@ -461,6 +498,12 @@ int benchmark(
FLAGS_sleep_between_net_and_operator,
FLAGS_output,
FLAGS_output_folder);
predict_vmem = getVirtualMemoryIfOptionEnabled(
FLAGS_measure_memory) - predict_vmem;
if (FLAGS_measure_memory) {
logBenchmarkResult(
"NET_", "memory", "kB", (init_vmem + predict_vmem) / 1024);
}
return 0;
}

View File

@ -125,7 +125,7 @@ void writeOutput(
const int num_blobs);
void runNetwork(
shared_ptr<caffe2::Workspace> workspace,
caffe2::NetDef& net_def,
caffe2::NetBase* net,
map<string, caffe2::TensorProtos>& tensor_protos_map,
const bool wipe_cache,
const bool run_individual,
@ -149,6 +149,7 @@ int benchmark(
const string& FLAGS_input_file,
const string& FLAGS_input_type,
int FLAGS_iter,
bool FLAGS_measure_memory,
const string& FLAGS_net,
const string& FLAGS_output,
const string& FLAGS_output_folder,

View File

@ -22,6 +22,7 @@ int main(int argc, char** argv) {
FLAGS_input_file,
FLAGS_input_type,
FLAGS_iter,
FLAGS_measure_memory,
FLAGS_net,
FLAGS_output,
FLAGS_output_folder,

View File

@ -19,6 +19,7 @@ struct PerformanceInformation {
std::string type = ""; // the type of the operator
// Measured
double latency = 0;
double cpuMilliseconds = 0;
};
class CAFFE2_OBSERVER_API NetObserverReporter {

View File

@ -1,5 +1,6 @@
#include "observers/net_observer_reporter_print.h"
#include <algorithm>
#include <sstream>
#include "caffe2/core/init.h"
#include "observers/observer_config.h"
@ -9,6 +10,7 @@ namespace caffe2 {
const std::string NetObserverReporterPrint::IDENTIFIER = "Caffe2Observer ";
static std::string get_op_args(PerformanceInformation p);
static std::string get_tensor_shapes(PerformanceInformation p);
static std::string sanatize(std::string json_s);
void NetObserverReporterPrint::report(
NetBase* net,
@ -23,29 +25,49 @@ void NetObserverReporterPrint::report(
{"value", c10::to_string(p.second.latency * 1000)},
{"unit", "us"},
{"metric", "latency"}});
caffe2_perf.push_back({{"type", "NET_"},
{
"value",
c10::to_string(
p.second.cpuMilliseconds /
p.second.latency *
100),
},
{"unit", "percent"},
{"metric", "cpu_percent"}});
} else if (p.first != "NET_DELAY") {
// for operator perf
std::string shape_str = get_tensor_shapes(p.second);
std::string args_str = get_op_args(p.second);
caffe2_perf.push_back({{"type", p.first},
std::string type = p.first;
caffe2_perf.push_back({{"type", type},
{"value", c10::to_string(p.second.latency * 1000)},
{"unit", "us"},
{"metric", "latency"}});
caffe2_perf.push_back({{"type", type},
{
"value",
c10::to_string(
p.second.cpuMilliseconds /
p.second.latency *
100),
},
{"unit", "percent"},
{"metric", "cpu_percent"}});
if (p.second.flops > 0) {
caffe2_perf.push_back({{"type", p.first},
caffe2_perf.push_back({{"type", type},
{"value", c10::to_string(p.second.flops)},
{"unit", "flop"},
{"metric", "flops"}});
}
if (shape_str != "") {
caffe2_perf.push_back({{"type", p.first},
caffe2_perf.push_back({{"type", type},
{"info_string", shape_str},
{"unit", ""},
{"metric", "tensor_shapes"}});
}
if (args_str != "") {
caffe2_perf.push_back({{"type", p.first},
caffe2_perf.push_back({{"type", type},
{"info_string", args_str},
{"unit", ""},
{"metric", "op_args"}});
@ -57,13 +79,13 @@ void NetObserverReporterPrint::report(
std::stringstream buffer;
auto entry = *it;
buffer << IDENTIFIER << "{";
buffer << "\"type\": \"" << entry["type"] << "\","
<< "\"unit\": \"" << entry["unit"] << "\","
<< "\"metric\": \"" << entry["metric"] << "\",";
buffer << "\"type\": \"" << sanatize(entry["type"]) << "\","
<< "\"unit\": \"" << sanatize(entry["unit"]) << "\","
<< "\"metric\": \"" << sanatize(entry["metric"]) << "\",";
if (entry.find("value") != entry.end()) {
buffer << "\"value\": \"" << entry["value"] << "\"";
buffer << "\"value\": \"" << sanatize(entry["value"]) << "\"";
} else if (entry.find("info_string") != entry.end()) {
buffer << "\"info_string\": \"" << entry["info_string"] << "\"";
buffer << "\"info_string\": \"" << sanatize(entry["info_string"]) << "\"";
}
buffer << "}";
LOG(INFO) << buffer.str();
@ -117,4 +139,12 @@ static std::string get_op_args(PerformanceInformation p) {
}
return args_str;
}
// Strips characters that would break the hand-built JSON emitted above.
// Double quotes and backslashes are removed outright (not escaped), so the
// surrounding quoted string stays well-formed.
static std::string sanatize(std::string json_s) {
  std::string cleaned;
  cleaned.reserve(json_s.size());
  for (const char c : json_s) {
    if (c != '"' && c != '\\') {
      cleaned += c;
    }
  }
  return cleaned;
}
}

View File

@ -6,6 +6,10 @@
#endif
#include <random>
#include <time.h>
#ifdef _WIN32
#include <windows.h>
#endif
#include "caffe2/core/common.h"
#include "caffe2/core/init.h"
#include "caffe2/core/operator.h"
@ -62,6 +66,75 @@ bool registerGlobalPerfNetObserverCreator(int* /*pargc*/, char*** /*pargv*/) {
}
} // namespace
#ifdef _WIN32
// Ticks-per-millisecond of the Windows high-resolution counter, cached in a
// function-local static after the first successful query. Returns 0.0 when
// the frequency cannot be obtained.
double getTicksPerMillisecond() {
  static LARGE_INTEGER ticks_per_sec;
  if (!ticks_per_sec.QuadPart) {
    QueryPerformanceFrequency(&ticks_per_sec);
    if (!ticks_per_sec.QuadPart) {
      return 0.0;
    }
  }
  return static_cast<double>(ticks_per_sec.QuadPart) / 1000.0;
}
#else
// Reads the given POSIX clock and converts the result to milliseconds.
// Returns 0.0 when clock_gettime fails.
double getClockTimeMilliseconds(clockid_t clk_id) {
  struct timespec tp;
  if (clock_gettime(clk_id, &tp) == -1) {
    return 0.0;
  }
  return tp.tv_sec * 1000.0 + tp.tv_nsec / 1000000.0;
}
#endif

// Monotonic wall-clock time in milliseconds; 0.0 if no usable clock exists.
double getWallClockTimeMilliseconds() {
#ifdef _WIN32
  const double ticks_per_ms = getTicksPerMillisecond();
  if (ticks_per_ms) {
    LARGE_INTEGER ticks;
    if (QueryPerformanceCounter(&ticks)) {
      return static_cast<double>(ticks.QuadPart) / ticks_per_ms;
    }
  }
  return 0.0;
#else
  return getClockTimeMilliseconds(CLOCK_MONOTONIC);
#endif
}

// Total CPU time (kernel + user) consumed by this process, in milliseconds;
// 0.0 when the underlying query fails.
double getCpuTimeMilliseconds() {
#ifdef _WIN32
  FILETIME creation_time;
  FILETIME exit_time;
  FILETIME kernel_time;
  FILETIME user_time;
  if (GetProcessTimes(
          GetCurrentProcess(),
          &creation_time,
          &exit_time,
          &kernel_time,
          &user_time)) {
    ULARGE_INTEGER kernel;
    ULARGE_INTEGER user;
    kernel.HighPart = kernel_time.dwHighDateTime;
    kernel.LowPart = kernel_time.dwLowDateTime;
    user.HighPart = user_time.dwHighDateTime;
    user.LowPart = user_time.dwLowDateTime;
    // FILETIME counts 100 ns ticks; 10000 ticks per millisecond.
    return (static_cast<double>(kernel.QuadPart) +
            static_cast<double>(user.QuadPart)) / 10000.0;
  }
  return 0.0;
#else
  return getClockTimeMilliseconds(CLOCK_PROCESS_CPUTIME_ID);
#endif
}
REGISTER_CAFFE2_EARLY_INIT_FUNCTION(
registerGlobalPerfNetObserverCreator,
&registerGlobalPerfNetObserverCreator,
@ -107,30 +180,31 @@ void PerfNetObserver::Start() {
}
}
if (logType_ != PerfNetObserver::NONE) {
/* Only start timer when we need to */
timer_.Start();
}
wallMilliseconds_ = getWallClockTimeMilliseconds();
cpuMilliseconds_ = getCpuTimeMilliseconds();
}
void PerfNetObserver::Stop() {
if (logType_ == PerfNetObserver::NONE) {
return;
}
auto currentRunTime = timer_.MilliSeconds();
std::map<std::string, PerformanceInformation> info;
PerformanceInformation net_perf;
net_perf.latency = currentRunTime;
net_perf.cpuMilliseconds =
getCpuTimeMilliseconds() - cpuMilliseconds_;
net_perf.latency =
getWallClockTimeMilliseconds() - wallMilliseconds_;
if (logType_ == PerfNetObserver::OPERATOR_DELAY) {
const auto& operators = subject_->GetOperators();
for (int idx = 0; idx < operators.size(); ++idx) {
const auto* op = operators[idx];
auto name = getObserverName(op, idx);
PerformanceInformation p;
p.latency = static_cast<const PerfOperatorObserver*>(observerMap_[op])
->getMilliseconds();
const PerfOperatorObserver* opObserver =
static_cast<const PerfOperatorObserver*>(observerMap_[op]);
p.latency = opObserver->getWallMilliseconds();
p.cpuMilliseconds = opObserver->getCpuMilliseconds();
p.engine = op->engine();
p.type = op->type();
p.tensor_shapes =
@ -176,30 +250,34 @@ PerfOperatorObserver::PerfOperatorObserver(
PerfNetObserver* netObserver)
: ObserverBase<OperatorBase>(op),
netObserver_(netObserver),
milliseconds_(0) {
wallMilliseconds_(0),
cpuMilliseconds_(0) {
CAFFE_ENFORCE(netObserver_, "Observers can't operate outside of the net");
}
// Out-of-line destructor; no explicit cleanup is required.
PerfOperatorObserver::~PerfOperatorObserver() {}
void PerfOperatorObserver::Start() {
  // Record absolute wall-clock and CPU timestamps at operator start; Stop()
  // turns them into per-invocation elapsed times by subtracting. (The
  // removed implementation instead derived operator time from the net-level
  // timer to avoid a per-operator timer; that trick is no longer needed.)
  wallMilliseconds_ = getWallClockTimeMilliseconds();
  cpuMilliseconds_ = getCpuTimeMilliseconds();
}
void PerfOperatorObserver::Stop() {
  // Convert the timestamps captured in Start() into elapsed CPU and
  // wall-clock milliseconds for this operator invocation.
  cpuMilliseconds_ =
      getCpuTimeMilliseconds() - cpuMilliseconds_;
  wallMilliseconds_ =
      getWallClockTimeMilliseconds() - wallMilliseconds_;
  // Snapshot the operator's input tensor shapes for later reporting.
  tensor_shapes_ = subject_->InputTensorShapes();
}
double PerfOperatorObserver::getMilliseconds() const {
return milliseconds_;
// Wall-clock milliseconds elapsed in the observed operator (set by Stop()).
double PerfOperatorObserver::getWallMilliseconds() const {
  return wallMilliseconds_;
}

// Process CPU milliseconds consumed while the operator ran (set by Stop()).
double PerfOperatorObserver::getCpuMilliseconds() const {
  return cpuMilliseconds_;
}
std::vector<TensorShape> PerfOperatorObserver::getTensorShapes() const {

View File

@ -10,16 +10,13 @@
namespace caffe2 {
double getClockTimeMilliseconds();
class CAFFE2_OBSERVER_API PerfNetObserver : public NetObserver {
public:
explicit PerfNetObserver(NetBase* subject_);
virtual ~PerfNetObserver();
caffe2::Timer& getTimer() {
return timer_;
}
private:
void Start() override;
void Stop() override;
@ -37,7 +34,8 @@ class CAFFE2_OBSERVER_API PerfNetObserver : public NetObserver {
std::unordered_map<const OperatorBase*, const ObserverBase<OperatorBase>*>
observerMap_;
caffe2::Timer timer_;
double wallMilliseconds_;
double cpuMilliseconds_;
};
class PerfOperatorObserver : public ObserverBase<OperatorBase> {
@ -45,7 +43,8 @@ class PerfOperatorObserver : public ObserverBase<OperatorBase> {
PerfOperatorObserver(OperatorBase* op, PerfNetObserver* netObserver);
virtual ~PerfOperatorObserver();
double getMilliseconds() const;
double getWallMilliseconds() const;
double getCpuMilliseconds() const;
std::vector<TensorShape> getTensorShapes() const;
private:
@ -60,7 +59,8 @@ class PerfOperatorObserver : public ObserverBase<OperatorBase> {
// without storing inside the operator observer. Each field is memory
costly here and a raw pointer is the cheapest solution
PerfNetObserver* netObserver_;
double milliseconds_;
double wallMilliseconds_;
double cpuMilliseconds_;
std::vector<TensorShape> tensor_shapes_;
};
} // namespace caffe2