From e23e4cc356216a1002c2d4ccaa6908f34c44ee18 Mon Sep 17 00:00:00 2001
From: Geoffrey Goh <geoffreygoh@fb.com>
Date: Wed, 7 Aug 2019 15:32:18 -0700
Subject: [PATCH] Back out "Revert D16469619: Add Virtual Memory and CPU
 percentage computation to AIBench"

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/23821

Reviewed By: hl475

Differential Revision: D16654854

fbshipit-source-id: f057023e890cbcbd9145ef2ecb449df2fbba592b
---
 binaries/benchmark_args.h                     |   5 +
 binaries/benchmark_helper.cc                  |  75 ++++++++---
 binaries/benchmark_helper.h                   |   3 +-
 binaries/caffe2_benchmark.cc                  |   1 +
 modules/observers/net_observer_reporter.h     |   1 +
 .../observers/net_observer_reporter_print.cc  |  50 ++++++--
 modules/observers/perf_observer.cc            | 118 +++++++++++++++---
 modules/observers/perf_observer.h             |  14 +--
 8 files changed, 213 insertions(+), 54 deletions(-)

diff --git a/binaries/benchmark_args.h b/binaries/benchmark_args.h
index 891f62622ae41..8a7826d5f9942 100644
--- a/binaries/benchmark_args.h
+++ b/binaries/benchmark_args.h
@@ -50,6 +50,11 @@ C10_DEFINE_string(
     "Input type when specifying the input dimension."
     "The supported types are float, uint8_t.");
 C10_DEFINE_int(iter, 10, "The number of iterations to run.");
+C10_DEFINE_bool(
+    measure_memory,
+    false,
+    "Whether to measure increase in allocated memory while "
+    "loading and running the net.");
 C10_DEFINE_string(net, "", "The given net to benchmark.");
 C10_DEFINE_string(
     output,
diff --git a/binaries/benchmark_helper.cc b/binaries/benchmark_helper.cc
index 01d92a25bc77e..958e0bf4efe57 100644
--- a/binaries/benchmark_helper.cc
+++ b/binaries/benchmark_helper.cc
@@ -35,6 +35,14 @@
 #include <observers/observer_config.h>
 #include <observers/perf_observer.h>
 
+#if defined(TARGET_OS_MAC) || \
+defined(TARGET_OS_IPHONE) || \
+defined(TARGET_IPHONE_SIMULATOR)
+#include <malloc/malloc.h>
+#else
+#include <malloc.h>
+#endif
+
 using std::map;
 using std::shared_ptr;
 using std::string;
@@ -235,7 +243,7 @@ void fillInputBlob(
 
 void runNetwork(
     shared_ptr<caffe2::Workspace> workspace,
-    caffe2::NetDef& net_def,
+    caffe2::NetBase* net,
     map<string, caffe2::TensorProtos>& tensor_protos_map,
     const bool wipe_cache,
     const bool run_individual,
@@ -250,13 +258,6 @@ void runNetwork(
     const std::string& output,
     const std::string& output_folder) {
 
-  if (!net_def.has_name()) {
-    net_def.set_name("benchmark");
-  }
-
-  caffe2::NetBase* net = workspace->CreateNet(net_def);
-  CHECK_NOTNULL(net);
-
   LOG(INFO) << "Starting benchmark.";
   caffe2::ObserverConfig::initSampleRate(1, 1, 1, run_individual, warmup);
   LOG(INFO) << "Running warmup runs.";
@@ -376,6 +377,35 @@ void writeOutput(
   }
 }
 
+void logBenchmarkResult(
+    const std::string& type,
+    const std::string& metric,
+    const std::string& unit,
+    const int value) {
+  LOG(INFO) << caffe2::NetObserverReporterPrint::IDENTIFIER << "{"
+            << "\"type\": \"" << type << "\", "
+            << "\"metric\": \"" << metric << "\", "
+            << "\"unit\": \"" << unit << "\", "
+            << "\"value\": " << c10::to_string(value) << "}\n";
+}
+
+long getVirtualMemoryIfOptionEnabled(bool FLAGS_measure_memory) {
+  if (FLAGS_measure_memory) {
+#if defined(TARGET_OS_IPHONE) || \
+defined(TARGET_OS_MAC) || \
+defined(TARGET_IPHONE_SIMULATOR)
+    malloc_statistics_t stats = {0};
+    malloc_zone_statistics(nullptr, &stats);
+    return stats.size_allocated;
+#else
+    struct mallinfo info = mallinfo();
+    return info.uordblks;
+#endif
+  }
+
+  return 0;
+}
+
 int benchmark(
     int argc,
     char* argv[],
@@ -386,6 +416,7 @@ int benchmark(
     const string& FLAGS_input_file,
     const string& FLAGS_input_type,
     int FLAGS_iter,
+    bool FLAGS_measure_memory,
     const string& FLAGS_net,
     const string& FLAGS_output,
     const string& FLAGS_output_folder,
@@ -423,19 +454,15 @@ int benchmark(
 
   auto workspace = std::make_shared<caffe2::Workspace>(new caffe2::Workspace());
   bool run_on_gpu = backendCudaSet(FLAGS_backend);
-  // Run initialization network.
+  // Run initialization network, measure resources used.
+  long init_vmem = getVirtualMemoryIfOptionEnabled(FLAGS_measure_memory);
   caffe2::NetDef init_net_def;
   CAFFE_ENFORCE(ReadProtoFromFile(FLAGS_init_net, &init_net_def));
   setOperatorEngine(&init_net_def, FLAGS_backend);
   CAFFE_ENFORCE(workspace->RunNetOnce(init_net_def));
-
-  // Run main network.
-  caffe2::NetDef net_def;
-  CAFFE_ENFORCE(ReadProtoFromFile(FLAGS_net, &net_def));
-  setOperatorEngine(&net_def, FLAGS_backend);
+  init_vmem = getVirtualMemoryIfOptionEnabled(FLAGS_measure_memory) - init_vmem;
 
   map<string, caffe2::TensorProtos> tensor_protos_map;
-
   int num_blobs = loadInput(
       workspace,
       run_on_gpu,
@@ -445,9 +472,19 @@ int benchmark(
       FLAGS_input_dims,
       FLAGS_input_type);
 
+  // Run main network.
+  long predict_vmem = getVirtualMemoryIfOptionEnabled(FLAGS_measure_memory);
+  caffe2::NetDef net_def;
+  CAFFE_ENFORCE(ReadProtoFromFile(FLAGS_net, &net_def));
+  setOperatorEngine(&net_def, FLAGS_backend);
+  if (!net_def.has_name()) {
+    net_def.set_name("benchmark");
+  }
+  caffe2::NetBase* net = workspace->CreateNet(net_def);
+  CHECK_NOTNULL(net);
   runNetwork(
       workspace,
-      net_def,
+      net,
       tensor_protos_map,
       FLAGS_wipe_cache,
       FLAGS_run_individual,
@@ -461,6 +498,12 @@ int benchmark(
       FLAGS_sleep_between_net_and_operator,
       FLAGS_output,
       FLAGS_output_folder);
+  predict_vmem = getVirtualMemoryIfOptionEnabled(
+      FLAGS_measure_memory) - predict_vmem;
+  if (FLAGS_measure_memory) {
+    logBenchmarkResult(
+        "NET_", "memory", "kB", (init_vmem + predict_vmem) / 1024);
+  }
 
   return 0;
 }
diff --git a/binaries/benchmark_helper.h b/binaries/benchmark_helper.h
index d09ba4e9c5594..f5630c112d19b 100644
--- a/binaries/benchmark_helper.h
+++ b/binaries/benchmark_helper.h
@@ -125,7 +125,7 @@ void writeOutput(
     const int num_blobs);
 void runNetwork(
     shared_ptr<caffe2::Workspace> workspace,
-    caffe2::NetDef& net_def,
+    caffe2::NetBase* net,
     map<string, caffe2::TensorProtos>& tensor_protos_map,
     const bool wipe_cache,
     const bool run_individual,
@@ -149,6 +149,7 @@ int benchmark(
     const string& FLAGS_input_file,
     const string& FLAGS_input_type,
     int FLAGS_iter,
+    bool FLAGS_measure_memory,
     const string& FLAGS_net,
     const string& FLAGS_output,
     const string& FLAGS_output_folder,
diff --git a/binaries/caffe2_benchmark.cc b/binaries/caffe2_benchmark.cc
index 57bd98d49723c..19ef223bdea6c 100644
--- a/binaries/caffe2_benchmark.cc
+++ b/binaries/caffe2_benchmark.cc
@@ -22,6 +22,7 @@ int main(int argc, char** argv) {
       FLAGS_input_file,
       FLAGS_input_type,
       FLAGS_iter,
+      FLAGS_measure_memory,
       FLAGS_net,
       FLAGS_output,
       FLAGS_output_folder,
diff --git a/modules/observers/net_observer_reporter.h b/modules/observers/net_observer_reporter.h
index 5619b69a636e7..bfccef64cee2b 100644
--- a/modules/observers/net_observer_reporter.h
+++ b/modules/observers/net_observer_reporter.h
@@ -19,6 +19,7 @@ struct PerformanceInformation {
   std::string type = ""; // the type of the operator
   // Measured
   double latency = 0;
+  double cpuMilliseconds = 0;
 };
 
 class CAFFE2_OBSERVER_API NetObserverReporter {
diff --git a/modules/observers/net_observer_reporter_print.cc b/modules/observers/net_observer_reporter_print.cc
index e00fe8c1fcc10..6baa7e01728dc 100644
--- a/modules/observers/net_observer_reporter_print.cc
+++ b/modules/observers/net_observer_reporter_print.cc
@@ -1,5 +1,6 @@
 #include "observers/net_observer_reporter_print.h"
 
+#include <algorithm>
 #include <sstream>
 #include "caffe2/core/init.h"
 #include "observers/observer_config.h"
@@ -9,6 +10,7 @@ namespace caffe2 {
 const std::string NetObserverReporterPrint::IDENTIFIER = "Caffe2Observer ";
 static std::string get_op_args(PerformanceInformation p);
 static std::string get_tensor_shapes(PerformanceInformation p);
+static std::string sanatize(std::string json_s);
 
 void NetObserverReporterPrint::report(
     NetBase* net,
@@ -23,29 +25,49 @@ void NetObserverReporterPrint::report(
                              {"value", c10::to_string(p.second.latency * 1000)},
                              {"unit", "us"},
                              {"metric", "latency"}});
+      caffe2_perf.push_back({{"type", "NET_"},
+                             {
+                               "value",
+                               c10::to_string(
+                                   p.second.cpuMilliseconds /
+                                   p.second.latency *
+                                   100),
+                             },
+                             {"unit", "percent"},
+                             {"metric", "cpu_percent"}});
     } else if (p.first != "NET_DELAY") {
       // for operator perf
       std::string shape_str = get_tensor_shapes(p.second);
       std::string args_str = get_op_args(p.second);
-
-      caffe2_perf.push_back({{"type", p.first},
+      std::string type = p.first;
+      caffe2_perf.push_back({{"type", type},
                              {"value", c10::to_string(p.second.latency * 1000)},
                              {"unit", "us"},
                              {"metric", "latency"}});
+      caffe2_perf.push_back({{"type", type},
+                             {
+                               "value",
+                               c10::to_string(
+                                   p.second.cpuMilliseconds /
+                                   p.second.latency *
+                                   100),
+                             },
+                             {"unit", "percent"},
+                             {"metric", "cpu_percent"}});
       if (p.second.flops > 0) {
-        caffe2_perf.push_back({{"type", p.first},
+        caffe2_perf.push_back({{"type", type},
                                {"value", c10::to_string(p.second.flops)},
                                {"unit", "flop"},
                                {"metric", "flops"}});
       }
       if (shape_str != "") {
-        caffe2_perf.push_back({{"type", p.first},
+        caffe2_perf.push_back({{"type", type},
                                {"info_string", shape_str},
                                {"unit", ""},
                                {"metric", "tensor_shapes"}});
       }
       if (args_str != "") {
-        caffe2_perf.push_back({{"type", p.first},
+        caffe2_perf.push_back({{"type", type},
                                {"info_string", args_str},
                                {"unit", ""},
                                {"metric", "op_args"}});
@@ -57,13 +79,13 @@ void NetObserverReporterPrint::report(
     std::stringstream buffer;
     auto entry = *it;
     buffer << IDENTIFIER << "{";
-    buffer << "\"type\": \"" << entry["type"] << "\","
-           << "\"unit\": \"" << entry["unit"] << "\","
-           << "\"metric\": \"" << entry["metric"] << "\",";
+    buffer << "\"type\": \"" << sanatize(entry["type"]) << "\","
+           << "\"unit\": \"" << sanatize(entry["unit"]) << "\","
+           << "\"metric\": \"" << sanatize(entry["metric"]) << "\",";
     if (entry.find("value") != entry.end()) {
-      buffer << "\"value\": \"" << entry["value"] << "\"";
+      buffer << "\"value\": \"" << sanatize(entry["value"]) << "\"";
     } else if (entry.find("info_string") != entry.end()) {
-      buffer << "\"info_string\": \"" << entry["info_string"] << "\"";
+      buffer << "\"info_string\": \"" << sanatize(entry["info_string"]) << "\"";
     }
     buffer << "}";
     LOG(INFO) << buffer.str();
@@ -117,4 +139,12 @@ static std::string get_op_args(PerformanceInformation p) {
   }
   return args_str;
 }
+
+static std::string sanatize(std::string json_s) {
+  // Remove illegal characters from the name that would cause json string to
+  // become invalid
+  json_s.erase(std::remove(json_s.begin(), json_s.end(), '"'), json_s.end());
+  json_s.erase(std::remove(json_s.begin(), json_s.end(), '\\'), json_s.end());
+  return json_s;
+}
 }
diff --git a/modules/observers/perf_observer.cc b/modules/observers/perf_observer.cc
index 2de1ce65114f5..9e60179832a44 100644
--- a/modules/observers/perf_observer.cc
+++ b/modules/observers/perf_observer.cc
@@ -6,6 +6,10 @@
 #endif
 
 #include <random>
+#include <time.h>
+#ifdef _WIN32
+#include <windows.h>
+#endif
 #include "caffe2/core/common.h"
 #include "caffe2/core/init.h"
 #include "caffe2/core/operator.h"
@@ -62,6 +66,75 @@ bool registerGlobalPerfNetObserverCreator(int* /*pargc*/, char*** /*pargv*/) {
 }
 } // namespace
 
+#ifdef _WIN32
+double getTicksPerMillisecond() {
+  static LARGE_INTEGER ticks_per_sec;
+  if (!ticks_per_sec.QuadPart) {
+    QueryPerformanceFrequency(&ticks_per_sec);
+    if (!ticks_per_sec.QuadPart) {
+      return 0.0;
+    }
+  }
+
+  return static_cast<double>(ticks_per_sec.QuadPart) / 1000.0;
+}
+#else
+double getClockTimeMilliseconds(clockid_t clk_id) {
+  int result;
+  struct timespec tp;
+  result = clock_gettime(clk_id, &tp);
+  if (result == -1) {
+    return 0.0;
+  } else {
+    return tp.tv_sec * 1000.0 + tp.tv_nsec / 1000000.0;
+  }
+}
+#endif
+
+double getWallClockTimeMilliseconds() {
+#ifdef _WIN32
+  double ticks_per_ms = getTicksPerMillisecond();
+  if (ticks_per_ms) {
+    LARGE_INTEGER ticks;
+    if (QueryPerformanceCounter(&ticks)) {
+      return static_cast<double>(ticks.QuadPart) / ticks_per_ms;
+    }
+  }
+
+  return 0.0;
+#else
+  return getClockTimeMilliseconds(CLOCK_MONOTONIC);
+#endif
+}
+
+double getCpuTimeMilliseconds() {
+#ifdef _WIN32
+  FILETIME creation_time;
+  FILETIME exit_time;
+  FILETIME kernel_time;
+  FILETIME user_time;
+  if (GetProcessTimes(
+      GetCurrentProcess(),
+      &creation_time,
+      &exit_time,
+      &kernel_time,
+      &user_time)) {
+    ULARGE_INTEGER kernel;
+    ULARGE_INTEGER user;
+    kernel.HighPart = kernel_time.dwHighDateTime;
+    kernel.LowPart = kernel_time.dwLowDateTime;
+    user.HighPart = user_time.dwHighDateTime;
+    user.LowPart = user_time.dwLowDateTime;
+    return (static_cast<double>(kernel.QuadPart) +
+        static_cast<double>(user.QuadPart)) / 10000.0;
+  }
+
+  return 0.0;
+#else
+  return getClockTimeMilliseconds(CLOCK_PROCESS_CPUTIME_ID);
+#endif
+}
+
 REGISTER_CAFFE2_EARLY_INIT_FUNCTION(
     registerGlobalPerfNetObserverCreator,
     &registerGlobalPerfNetObserverCreator,
@@ -107,30 +180,31 @@ void PerfNetObserver::Start() {
     }
   }
 
-  if (logType_ != PerfNetObserver::NONE) {
-    /* Only start timer when we need to */
-    timer_.Start();
-  }
+  wallMilliseconds_ = getWallClockTimeMilliseconds();
+  cpuMilliseconds_ = getCpuTimeMilliseconds();
 }
 
 void PerfNetObserver::Stop() {
   if (logType_ == PerfNetObserver::NONE) {
     return;
   }
-  auto currentRunTime = timer_.MilliSeconds();
   std::map<std::string, PerformanceInformation> info;
   PerformanceInformation net_perf;
-  net_perf.latency = currentRunTime;
+  net_perf.cpuMilliseconds =
+      getCpuTimeMilliseconds() - cpuMilliseconds_;
+  net_perf.latency =
+      getWallClockTimeMilliseconds() - wallMilliseconds_;
+
   if (logType_ == PerfNetObserver::OPERATOR_DELAY) {
     const auto& operators = subject_->GetOperators();
     for (int idx = 0; idx < operators.size(); ++idx) {
       const auto* op = operators[idx];
       auto name = getObserverName(op, idx);
       PerformanceInformation p;
-
-      p.latency = static_cast<const PerfOperatorObserver*>(observerMap_[op])
-                      ->getMilliseconds();
-
+      const PerfOperatorObserver* opObserver =
+          static_cast<const PerfOperatorObserver*>(observerMap_[op]);
+      p.latency = opObserver->getWallMilliseconds();
+      p.cpuMilliseconds = opObserver->getCpuMilliseconds();
       p.engine = op->engine();
       p.type = op->type();
       p.tensor_shapes =
@@ -176,30 +250,34 @@ PerfOperatorObserver::PerfOperatorObserver(
     PerfNetObserver* netObserver)
     : ObserverBase<OperatorBase>(op),
       netObserver_(netObserver),
-      milliseconds_(0) {
+      wallMilliseconds_(0),
+      cpuMilliseconds_(0) {
   CAFFE_ENFORCE(netObserver_, "Observers can't operate outside of the net");
 }
 
 PerfOperatorObserver::~PerfOperatorObserver() {}
 
 void PerfOperatorObserver::Start() {
-  /* Get the time from the start of the net minus the time spent
-     in previous invocations. It is the time spent on other operators.
-     This way, when the operator finishes, the time from the start of the net
-     minus the time spent in all other operators  is the total time on this
-     operator. This is done to avoid saving a timer in each operator */
-  milliseconds_ = netObserver_->getTimer().MilliSeconds() - milliseconds_;
+  wallMilliseconds_ = getWallClockTimeMilliseconds();
+  cpuMilliseconds_ = getCpuTimeMilliseconds();
 }
 
 void PerfOperatorObserver::Stop() {
   /* Time from the start of the net minus the time spent on all other
      operators is the time spent on this operator */
-  milliseconds_ = netObserver_->getTimer().MilliSeconds() - milliseconds_;
+  cpuMilliseconds_ =
+      getCpuTimeMilliseconds() - cpuMilliseconds_;
+  wallMilliseconds_ =
+      getWallClockTimeMilliseconds() - wallMilliseconds_;
   tensor_shapes_ = subject_->InputTensorShapes();
 }
 
-double PerfOperatorObserver::getMilliseconds() const {
-  return milliseconds_;
+double PerfOperatorObserver::getWallMilliseconds() const {
+  return wallMilliseconds_;
+}
+
+double PerfOperatorObserver::getCpuMilliseconds() const {
+  return cpuMilliseconds_;
 }
 
 std::vector<TensorShape> PerfOperatorObserver::getTensorShapes() const {
diff --git a/modules/observers/perf_observer.h b/modules/observers/perf_observer.h
index 11fb870a61961..71e1190e840ba 100644
--- a/modules/observers/perf_observer.h
+++ b/modules/observers/perf_observer.h
@@ -10,16 +10,13 @@
 
 namespace caffe2 {
 
+double getClockTimeMilliseconds();
 
 class CAFFE2_OBSERVER_API PerfNetObserver : public NetObserver {
  public:
   explicit PerfNetObserver(NetBase* subject_);
   virtual ~PerfNetObserver();
 
-  caffe2::Timer& getTimer() {
-    return timer_;
-  }
-
  private:
   void Start() override;
   void Stop() override;
@@ -37,7 +34,8 @@ class CAFFE2_OBSERVER_API PerfNetObserver : public NetObserver {
   std::unordered_map<const OperatorBase*, const ObserverBase<OperatorBase>*>
       observerMap_;
 
-  caffe2::Timer timer_;
+  double wallMilliseconds_;
+  double cpuMilliseconds_;
 };
 
 class PerfOperatorObserver : public ObserverBase<OperatorBase> {
@@ -45,7 +43,8 @@ class PerfOperatorObserver : public ObserverBase<OperatorBase> {
   PerfOperatorObserver(OperatorBase* op, PerfNetObserver* netObserver);
   virtual ~PerfOperatorObserver();
 
-  double getMilliseconds() const;
+  double getWallMilliseconds() const;
+  double getCpuMilliseconds() const;
   std::vector<TensorShape> getTensorShapes() const;
 
  private:
@@ -60,7 +59,8 @@ class PerfOperatorObserver : public ObserverBase<OperatorBase> {
   // without storing inside the operator observer. Each field is memory
   // costly here and a raw pointer is a cheapest sholution
   PerfNetObserver* netObserver_;
-  double milliseconds_;
+  double wallMilliseconds_;
+  double cpuMilliseconds_;
   std::vector<TensorShape> tensor_shapes_;
 };
 } // namespace caffe2