Back out "Revert D16469619: Add Virtual Memory and CPU percentage computation to AIBench"

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/23821

Reviewed By: hl475

Differential Revision: D16654854

fbshipit-source-id: f057023e890cbcbd9145ef2ecb449df2fbba592b
This commit is contained in:
Geoffrey Goh
2019-08-07 15:32:18 -07:00
committed by Facebook Github Bot
parent e90adf59a0
commit e23e4cc356
8 changed files with 213 additions and 54 deletions

View File

@ -50,6 +50,11 @@ C10_DEFINE_string(
"Input type when specifying the input dimension."
"The supported types are float, uint8_t.");
C10_DEFINE_int(iter, 10, "The number of iterations to run.");
// Gates the virtual-memory measurement: when true, the benchmark samples the
// allocator's in-use byte count before/after loading and running the net and
// logs the delta as a "memory" metric (see getVirtualMemoryIfOptionEnabled).
C10_DEFINE_bool(
    measure_memory,
    false,
    "Whether to measure increase in allocated memory while "
    "loading and running the net.");
C10_DEFINE_string(net, "", "The given net to benchmark.");
C10_DEFINE_string(
output,

View File

@ -35,6 +35,14 @@
#include <observers/observer_config.h>
#include <observers/perf_observer.h>
#if defined(TARGET_OS_MAC) || \
defined(TARGET_OS_IPHONE) || \
defined(TARGET_IPHONE_SIMULATOR)
#include <malloc/malloc.h>
#else
#include <malloc.h>
#endif
using std::map;
using std::shared_ptr;
using std::string;
@ -235,7 +243,7 @@ void fillInputBlob(
void runNetwork(
shared_ptr<caffe2::Workspace> workspace,
caffe2::NetDef& net_def,
caffe2::NetBase* net,
map<string, caffe2::TensorProtos>& tensor_protos_map,
const bool wipe_cache,
const bool run_individual,
@ -250,13 +258,6 @@ void runNetwork(
const std::string& output,
const std::string& output_folder) {
if (!net_def.has_name()) {
net_def.set_name("benchmark");
}
caffe2::NetBase* net = workspace->CreateNet(net_def);
CHECK_NOTNULL(net);
LOG(INFO) << "Starting benchmark.";
caffe2::ObserverConfig::initSampleRate(1, 1, 1, run_individual, warmup);
LOG(INFO) << "Running warmup runs.";
@ -376,6 +377,35 @@ void writeOutput(
}
}
// Emits one benchmark metric as a single JSON-ish log line, prefixed with
// NetObserverReporterPrint::IDENTIFIER so it matches the reporter's output
// format and can be picked up by the same log parsers.
//
// @param type   metric scope tag (e.g. "NET_").
// @param metric metric name (e.g. "memory").
// @param unit   unit string (e.g. "kB").
// @param value  integral metric value.
void logBenchmarkResult(
    const std::string& type,
    const std::string& metric,
    const std::string& unit,
    const int value) {
  // NOTE(review): the trailing "\n" produces a blank line on top of
  // LOG(INFO)'s own newline — confirm downstream parsers tolerate it.
  LOG(INFO) << caffe2::NetObserverReporterPrint::IDENTIFIER << "{"
            << "\"type\": \"" << type << "\", "
            << "\"metric\": \"" << metric << "\", "
            << "\"unit\": \"" << unit << "\", "
            << "\"value\": " << c10::to_string(value) << "}\n";
}
// Returns the number of bytes currently handed out by the process allocator,
// or 0 when measurement is disabled. Callers take two samples and diff them
// to estimate memory growth across a phase (see benchmark()).
long getVirtualMemoryIfOptionEnabled(bool FLAGS_measure_memory) {
  if (!FLAGS_measure_memory) {
    return 0;
  }
#if defined(TARGET_OS_IPHONE) || \
    defined(TARGET_OS_MAC) || \
    defined(TARGET_IPHONE_SIMULATOR)
  // Apple platforms: ask the default malloc zone for its statistics.
  malloc_statistics_t stats = {0};
  malloc_zone_statistics(nullptr, &stats);
  return stats.size_allocated;
#else
  // glibc and compatible allocators: uordblks is total allocated space.
  // NOTE(review): mallinfo's fields are `int` and can wrap for heaps > 2 GiB.
  struct mallinfo info = mallinfo();
  return info.uordblks;
#endif
}
int benchmark(
int argc,
char* argv[],
@ -386,6 +416,7 @@ int benchmark(
const string& FLAGS_input_file,
const string& FLAGS_input_type,
int FLAGS_iter,
bool FLAGS_measure_memory,
const string& FLAGS_net,
const string& FLAGS_output,
const string& FLAGS_output_folder,
@ -423,19 +454,15 @@ int benchmark(
auto workspace = std::make_shared<caffe2::Workspace>(new caffe2::Workspace());
bool run_on_gpu = backendCudaSet(FLAGS_backend);
// Run initialization network.
// Run initialization network, measure resources used.
long init_vmem = getVirtualMemoryIfOptionEnabled(FLAGS_measure_memory);
caffe2::NetDef init_net_def;
CAFFE_ENFORCE(ReadProtoFromFile(FLAGS_init_net, &init_net_def));
setOperatorEngine(&init_net_def, FLAGS_backend);
CAFFE_ENFORCE(workspace->RunNetOnce(init_net_def));
// Run main network.
caffe2::NetDef net_def;
CAFFE_ENFORCE(ReadProtoFromFile(FLAGS_net, &net_def));
setOperatorEngine(&net_def, FLAGS_backend);
init_vmem = getVirtualMemoryIfOptionEnabled(FLAGS_measure_memory) - init_vmem;
map<string, caffe2::TensorProtos> tensor_protos_map;
int num_blobs = loadInput(
workspace,
run_on_gpu,
@ -445,9 +472,19 @@ int benchmark(
FLAGS_input_dims,
FLAGS_input_type);
// Run main network.
long predict_vmem = getVirtualMemoryIfOptionEnabled(FLAGS_measure_memory);
caffe2::NetDef net_def;
CAFFE_ENFORCE(ReadProtoFromFile(FLAGS_net, &net_def));
setOperatorEngine(&net_def, FLAGS_backend);
if (!net_def.has_name()) {
net_def.set_name("benchmark");
}
caffe2::NetBase* net = workspace->CreateNet(net_def);
CHECK_NOTNULL(net);
runNetwork(
workspace,
net_def,
net,
tensor_protos_map,
FLAGS_wipe_cache,
FLAGS_run_individual,
@ -461,6 +498,12 @@ int benchmark(
FLAGS_sleep_between_net_and_operator,
FLAGS_output,
FLAGS_output_folder);
predict_vmem = getVirtualMemoryIfOptionEnabled(
FLAGS_measure_memory) - predict_vmem;
if (FLAGS_measure_memory) {
logBenchmarkResult(
"NET_", "memory", "kB", (init_vmem + predict_vmem) / 1024);
}
return 0;
}

View File

@ -125,7 +125,7 @@ void writeOutput(
const int num_blobs);
void runNetwork(
shared_ptr<caffe2::Workspace> workspace,
caffe2::NetDef& net_def,
caffe2::NetBase* net,
map<string, caffe2::TensorProtos>& tensor_protos_map,
const bool wipe_cache,
const bool run_individual,
@ -149,6 +149,7 @@ int benchmark(
const string& FLAGS_input_file,
const string& FLAGS_input_type,
int FLAGS_iter,
bool FLAGS_measure_memory,
const string& FLAGS_net,
const string& FLAGS_output,
const string& FLAGS_output_folder,

View File

@ -22,6 +22,7 @@ int main(int argc, char** argv) {
FLAGS_input_file,
FLAGS_input_type,
FLAGS_iter,
FLAGS_measure_memory,
FLAGS_net,
FLAGS_output,
FLAGS_output_folder,

View File

@ -19,6 +19,7 @@ struct PerformanceInformation {
std::string type = ""; // the type of the operator
// Measured
double latency = 0;
double cpuMilliseconds = 0;
};
class CAFFE2_OBSERVER_API NetObserverReporter {

View File

@ -1,5 +1,6 @@
#include "observers/net_observer_reporter_print.h"
#include <algorithm>
#include <sstream>
#include "caffe2/core/init.h"
#include "observers/observer_config.h"
@ -9,6 +10,7 @@ namespace caffe2 {
const std::string NetObserverReporterPrint::IDENTIFIER = "Caffe2Observer ";
static std::string get_op_args(PerformanceInformation p);
static std::string get_tensor_shapes(PerformanceInformation p);
static std::string sanatize(std::string json_s);
void NetObserverReporterPrint::report(
NetBase* net,
@ -23,29 +25,49 @@ void NetObserverReporterPrint::report(
{"value", c10::to_string(p.second.latency * 1000)},
{"unit", "us"},
{"metric", "latency"}});
caffe2_perf.push_back({{"type", "NET_"},
{
"value",
c10::to_string(
p.second.cpuMilliseconds /
p.second.latency *
100),
},
{"unit", "percent"},
{"metric", "cpu_percent"}});
} else if (p.first != "NET_DELAY") {
// for operator perf
std::string shape_str = get_tensor_shapes(p.second);
std::string args_str = get_op_args(p.second);
caffe2_perf.push_back({{"type", p.first},
std::string type = p.first;
caffe2_perf.push_back({{"type", type},
{"value", c10::to_string(p.second.latency * 1000)},
{"unit", "us"},
{"metric", "latency"}});
caffe2_perf.push_back({{"type", type},
{
"value",
c10::to_string(
p.second.cpuMilliseconds /
p.second.latency *
100),
},
{"unit", "percent"},
{"metric", "cpu_percent"}});
if (p.second.flops > 0) {
caffe2_perf.push_back({{"type", p.first},
caffe2_perf.push_back({{"type", type},
{"value", c10::to_string(p.second.flops)},
{"unit", "flop"},
{"metric", "flops"}});
}
if (shape_str != "") {
caffe2_perf.push_back({{"type", p.first},
caffe2_perf.push_back({{"type", type},
{"info_string", shape_str},
{"unit", ""},
{"metric", "tensor_shapes"}});
}
if (args_str != "") {
caffe2_perf.push_back({{"type", p.first},
caffe2_perf.push_back({{"type", type},
{"info_string", args_str},
{"unit", ""},
{"metric", "op_args"}});
@ -57,13 +79,13 @@ void NetObserverReporterPrint::report(
std::stringstream buffer;
auto entry = *it;
buffer << IDENTIFIER << "{";
buffer << "\"type\": \"" << entry["type"] << "\","
<< "\"unit\": \"" << entry["unit"] << "\","
<< "\"metric\": \"" << entry["metric"] << "\",";
buffer << "\"type\": \"" << sanatize(entry["type"]) << "\","
<< "\"unit\": \"" << sanatize(entry["unit"]) << "\","
<< "\"metric\": \"" << sanatize(entry["metric"]) << "\",";
if (entry.find("value") != entry.end()) {
buffer << "\"value\": \"" << entry["value"] << "\"";
buffer << "\"value\": \"" << sanatize(entry["value"]) << "\"";
} else if (entry.find("info_string") != entry.end()) {
buffer << "\"info_string\": \"" << entry["info_string"] << "\"";
buffer << "\"info_string\": \"" << sanatize(entry["info_string"]) << "\"";
}
buffer << "}";
LOG(INFO) << buffer.str();
@ -117,4 +139,12 @@ static std::string get_op_args(PerformanceInformation p) {
}
return args_str;
}
// Strips characters that would break the hand-built JSON emitted above.
// Double quotes and backslashes are removed outright (not escaped), so the
// surrounding quoted string stays well-formed.
static std::string sanatize(std::string json_s) {
  std::string cleaned;
  cleaned.reserve(json_s.size());
  for (const char c : json_s) {
    if (c != '"' && c != '\\') {
      cleaned += c;
    }
  }
  return cleaned;
}
}

View File

@ -6,6 +6,10 @@
#endif
#include <random>
#include <time.h>
#ifdef _WIN32
#include <windows.h>
#endif
#include "caffe2/core/common.h"
#include "caffe2/core/init.h"
#include "caffe2/core/operator.h"
@ -62,6 +66,75 @@ bool registerGlobalPerfNetObserverCreator(int* /*pargc*/, char*** /*pargv*/) {
}
} // namespace
#ifdef _WIN32
// Ticks-per-millisecond of the Windows high-resolution counter, cached in a
// function-local static after the first successful query. Returns 0.0 when
// the frequency cannot be obtained.
double getTicksPerMillisecond() {
  static LARGE_INTEGER ticks_per_sec;
  if (!ticks_per_sec.QuadPart) {
    QueryPerformanceFrequency(&ticks_per_sec);
    if (!ticks_per_sec.QuadPart) {
      return 0.0;
    }
  }
  return static_cast<double>(ticks_per_sec.QuadPart) / 1000.0;
}
#else
// Reads the given POSIX clock and converts the result to milliseconds.
// Returns 0.0 when clock_gettime fails.
double getClockTimeMilliseconds(clockid_t clk_id) {
  struct timespec tp;
  if (clock_gettime(clk_id, &tp) == -1) {
    return 0.0;
  }
  return tp.tv_sec * 1000.0 + tp.tv_nsec / 1000000.0;
}
#endif

// Monotonic wall-clock time in milliseconds; 0.0 if no usable clock exists.
double getWallClockTimeMilliseconds() {
#ifdef _WIN32
  const double ticks_per_ms = getTicksPerMillisecond();
  if (ticks_per_ms) {
    LARGE_INTEGER ticks;
    if (QueryPerformanceCounter(&ticks)) {
      return static_cast<double>(ticks.QuadPart) / ticks_per_ms;
    }
  }
  return 0.0;
#else
  return getClockTimeMilliseconds(CLOCK_MONOTONIC);
#endif
}

// Total CPU time (kernel + user) consumed by this process, in milliseconds;
// 0.0 when the underlying query fails.
double getCpuTimeMilliseconds() {
#ifdef _WIN32
  FILETIME creation_time;
  FILETIME exit_time;
  FILETIME kernel_time;
  FILETIME user_time;
  if (GetProcessTimes(
          GetCurrentProcess(),
          &creation_time,
          &exit_time,
          &kernel_time,
          &user_time)) {
    ULARGE_INTEGER kernel;
    ULARGE_INTEGER user;
    kernel.HighPart = kernel_time.dwHighDateTime;
    kernel.LowPart = kernel_time.dwLowDateTime;
    user.HighPart = user_time.dwHighDateTime;
    user.LowPart = user_time.dwLowDateTime;
    // FILETIME counts 100 ns ticks; 10000 ticks per millisecond.
    return (static_cast<double>(kernel.QuadPart) +
            static_cast<double>(user.QuadPart)) / 10000.0;
  }
  return 0.0;
#else
  return getClockTimeMilliseconds(CLOCK_PROCESS_CPUTIME_ID);
#endif
}
REGISTER_CAFFE2_EARLY_INIT_FUNCTION(
registerGlobalPerfNetObserverCreator,
&registerGlobalPerfNetObserverCreator,
@ -107,30 +180,31 @@ void PerfNetObserver::Start() {
}
}
if (logType_ != PerfNetObserver::NONE) {
/* Only start timer when we need to */
timer_.Start();
}
wallMilliseconds_ = getWallClockTimeMilliseconds();
cpuMilliseconds_ = getCpuTimeMilliseconds();
}
void PerfNetObserver::Stop() {
if (logType_ == PerfNetObserver::NONE) {
return;
}
auto currentRunTime = timer_.MilliSeconds();
std::map<std::string, PerformanceInformation> info;
PerformanceInformation net_perf;
net_perf.latency = currentRunTime;
net_perf.cpuMilliseconds =
getCpuTimeMilliseconds() - cpuMilliseconds_;
net_perf.latency =
getWallClockTimeMilliseconds() - wallMilliseconds_;
if (logType_ == PerfNetObserver::OPERATOR_DELAY) {
const auto& operators = subject_->GetOperators();
for (int idx = 0; idx < operators.size(); ++idx) {
const auto* op = operators[idx];
auto name = getObserverName(op, idx);
PerformanceInformation p;
p.latency = static_cast<const PerfOperatorObserver*>(observerMap_[op])
->getMilliseconds();
const PerfOperatorObserver* opObserver =
static_cast<const PerfOperatorObserver*>(observerMap_[op]);
p.latency = opObserver->getWallMilliseconds();
p.cpuMilliseconds = opObserver->getCpuMilliseconds();
p.engine = op->engine();
p.type = op->type();
p.tensor_shapes =
@ -176,30 +250,34 @@ PerfOperatorObserver::PerfOperatorObserver(
PerfNetObserver* netObserver)
: ObserverBase<OperatorBase>(op),
netObserver_(netObserver),
milliseconds_(0) {
wallMilliseconds_(0),
cpuMilliseconds_(0) {
CAFFE_ENFORCE(netObserver_, "Observers can't operate outside of the net");
}
// Out-of-line destructor; no explicit cleanup is required.
PerfOperatorObserver::~PerfOperatorObserver() {}
void PerfOperatorObserver::Start() {
  // Record absolute wall-clock and CPU timestamps at operator start; Stop()
  // turns them into per-invocation elapsed times by subtracting. (The
  // removed implementation instead derived operator time from the net-level
  // timer to avoid a per-operator timer; that trick is no longer needed.)
  wallMilliseconds_ = getWallClockTimeMilliseconds();
  cpuMilliseconds_ = getCpuTimeMilliseconds();
}
void PerfOperatorObserver::Stop() {
  // Convert the timestamps captured in Start() into elapsed CPU and
  // wall-clock milliseconds for this operator invocation.
  cpuMilliseconds_ =
      getCpuTimeMilliseconds() - cpuMilliseconds_;
  wallMilliseconds_ =
      getWallClockTimeMilliseconds() - wallMilliseconds_;
  // Snapshot the operator's input tensor shapes for later reporting.
  tensor_shapes_ = subject_->InputTensorShapes();
}
double PerfOperatorObserver::getMilliseconds() const {
return milliseconds_;
// Wall-clock milliseconds elapsed in the observed operator (set by Stop()).
double PerfOperatorObserver::getWallMilliseconds() const {
  return wallMilliseconds_;
}

// Process CPU milliseconds consumed while the operator ran (set by Stop()).
double PerfOperatorObserver::getCpuMilliseconds() const {
  return cpuMilliseconds_;
}
std::vector<TensorShape> PerfOperatorObserver::getTensorShapes() const {

View File

@ -10,16 +10,13 @@
namespace caffe2 {
double getClockTimeMilliseconds();
class CAFFE2_OBSERVER_API PerfNetObserver : public NetObserver {
public:
explicit PerfNetObserver(NetBase* subject_);
virtual ~PerfNetObserver();
caffe2::Timer& getTimer() {
return timer_;
}
private:
void Start() override;
void Stop() override;
@ -37,7 +34,8 @@ class CAFFE2_OBSERVER_API PerfNetObserver : public NetObserver {
std::unordered_map<const OperatorBase*, const ObserverBase<OperatorBase>*>
observerMap_;
caffe2::Timer timer_;
double wallMilliseconds_;
double cpuMilliseconds_;
};
class PerfOperatorObserver : public ObserverBase<OperatorBase> {
@ -45,7 +43,8 @@ class PerfOperatorObserver : public ObserverBase<OperatorBase> {
PerfOperatorObserver(OperatorBase* op, PerfNetObserver* netObserver);
virtual ~PerfOperatorObserver();
double getMilliseconds() const;
double getWallMilliseconds() const;
double getCpuMilliseconds() const;
std::vector<TensorShape> getTensorShapes() const;
private:
@ -60,7 +59,8 @@ class PerfOperatorObserver : public ObserverBase<OperatorBase> {
// without storing inside the operator observer. Each field is memory
costly here and a raw pointer is the cheapest solution
PerfNetObserver* netObserver_;
double milliseconds_;
double wallMilliseconds_;
double cpuMilliseconds_;
std::vector<TensorShape> tensor_shapes_;
};
} // namespace caffe2