From e23e4cc356216a1002c2d4ccaa6908f34c44ee18 Mon Sep 17 00:00:00 2001 From: Geoffrey Goh Date: Wed, 7 Aug 2019 15:32:18 -0700 Subject: [PATCH] Back out "Revert D16469619: Add Virtual Memory and CPU percentage computation to AIBench" Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/23821 Reviewed By: hl475 Differential Revision: D16654854 fbshipit-source-id: f057023e890cbcbd9145ef2ecb449df2fbba592b --- binaries/benchmark_args.h | 5 + binaries/benchmark_helper.cc | 75 ++++++++--- binaries/benchmark_helper.h | 3 +- binaries/caffe2_benchmark.cc | 1 + modules/observers/net_observer_reporter.h | 1 + .../observers/net_observer_reporter_print.cc | 50 ++++++-- modules/observers/perf_observer.cc | 118 +++++++++++++++--- modules/observers/perf_observer.h | 14 +-- 8 files changed, 213 insertions(+), 54 deletions(-) diff --git a/binaries/benchmark_args.h b/binaries/benchmark_args.h index 891f62622ae41..8a7826d5f9942 100644 --- a/binaries/benchmark_args.h +++ b/binaries/benchmark_args.h @@ -50,6 +50,11 @@ C10_DEFINE_string( "Input type when specifying the input dimension." "The supported types are float, uint8_t."); C10_DEFINE_int(iter, 10, "The number of iterations to run."); +C10_DEFINE_bool( + measure_memory, + false, + "Whether to measure increase in allocated memory while " + "loading and running the net."); C10_DEFINE_string(net, "", "The given net to benchmark."); C10_DEFINE_string( output, diff --git a/binaries/benchmark_helper.cc b/binaries/benchmark_helper.cc index 01d92a25bc77e..958e0bf4efe57 100644 --- a/binaries/benchmark_helper.cc +++ b/binaries/benchmark_helper.cc @@ -35,6 +35,14 @@ #include #include +#if defined(TARGET_OS_MAC) || \ +defined(TARGET_OS_IPHONE) || \ +defined(TARGET_IPHONE_SIMULATOR) +#include +#else +#include +#endif + using std::map; using std::shared_ptr; using std::string; @@ -235,7 +243,7 @@ void fillInputBlob( void runNetwork( shared_ptr workspace, - caffe2::NetDef& net_def, + caffe2::NetBase* net, map& tensor_protos_map, const bool wipe_cache, const bool run_individual, @@ -250,13 +258,6 @@ void runNetwork( const std::string& output, const std::string& output_folder) { - if (!net_def.has_name()) { - net_def.set_name("benchmark"); - } - - caffe2::NetBase* net = workspace->CreateNet(net_def); - CHECK_NOTNULL(net); - LOG(INFO) << "Starting benchmark."; caffe2::ObserverConfig::initSampleRate(1, 1, 1, run_individual, warmup); LOG(INFO) << "Running warmup runs."; @@ -376,6 +377,35 @@ void writeOutput( } } +void logBenchmarkResult( + const std::string& type, + const std::string& metric, + const std::string& unit, + const int value) { + LOG(INFO) << caffe2::NetObserverReporterPrint::IDENTIFIER << "{" + << "\"type\": \"" << type << "\", " + << "\"metric\": \"" << metric << "\", " + << "\"unit\": \"" << unit << "\", " + << "\"value\": " << c10::to_string(value) << "}\n"; +} + +long getVirtualMemoryIfOptionEnabled(bool FLAGS_measure_memory) { + if (FLAGS_measure_memory) { +#if defined(TARGET_OS_IPHONE) || \ +defined(TARGET_OS_MAC) || \ +defined(TARGET_IPHONE_SIMULATOR) + malloc_statistics_t stats = {0}; + malloc_zone_statistics(nullptr, &stats); + return stats.size_allocated; +#else + struct mallinfo info = mallinfo(); + return info.uordblks; +#endif + } + + return 0; +} + int benchmark( int argc, char* argv[], @@ -386,6 +416,7 @@ int benchmark( const string& FLAGS_input_file, const string& FLAGS_input_type, int FLAGS_iter, + bool FLAGS_measure_memory, const string& FLAGS_net, const string& FLAGS_output, const string& FLAGS_output_folder, @@ -423,19 +454,15 @@ int benchmark( auto workspace = std::make_shared(new caffe2::Workspace()); bool run_on_gpu = backendCudaSet(FLAGS_backend); - // Run initialization network. + // Run initialization network, measure resources used. + long init_vmem = getVirtualMemoryIfOptionEnabled(FLAGS_measure_memory); caffe2::NetDef init_net_def; CAFFE_ENFORCE(ReadProtoFromFile(FLAGS_init_net, &init_net_def)); setOperatorEngine(&init_net_def, FLAGS_backend); CAFFE_ENFORCE(workspace->RunNetOnce(init_net_def)); - - // Run main network. - caffe2::NetDef net_def; - CAFFE_ENFORCE(ReadProtoFromFile(FLAGS_net, &net_def)); - setOperatorEngine(&net_def, FLAGS_backend); + init_vmem = getVirtualMemoryIfOptionEnabled(FLAGS_measure_memory) - init_vmem; map tensor_protos_map; - int num_blobs = loadInput( workspace, run_on_gpu, @@ -445,9 +472,19 @@ int benchmark( FLAGS_input_dims, FLAGS_input_type); + // Run main network. + long predict_vmem = getVirtualMemoryIfOptionEnabled(FLAGS_measure_memory); + caffe2::NetDef net_def; + CAFFE_ENFORCE(ReadProtoFromFile(FLAGS_net, &net_def)); + setOperatorEngine(&net_def, FLAGS_backend); + if (!net_def.has_name()) { + net_def.set_name("benchmark"); + } + caffe2::NetBase* net = workspace->CreateNet(net_def); + CHECK_NOTNULL(net); runNetwork( workspace, - net_def, + net, tensor_protos_map, FLAGS_wipe_cache, FLAGS_run_individual, @@ -461,6 +498,12 @@ int benchmark( FLAGS_sleep_between_net_and_operator, FLAGS_output, FLAGS_output_folder); + predict_vmem = getVirtualMemoryIfOptionEnabled( + FLAGS_measure_memory) - predict_vmem; + if (FLAGS_measure_memory) { + logBenchmarkResult( + "NET_", "memory", "kB", (init_vmem + predict_vmem) / 1024); + } return 0; } diff --git a/binaries/benchmark_helper.h b/binaries/benchmark_helper.h index d09ba4e9c5594..f5630c112d19b 100644 --- a/binaries/benchmark_helper.h +++ b/binaries/benchmark_helper.h @@ -125,7 +125,7 @@ void writeOutput( const int num_blobs); void runNetwork( shared_ptr workspace, - caffe2::NetDef& net_def, + caffe2::NetBase* net, map& tensor_protos_map, const bool wipe_cache, const bool run_individual, @@ -149,6 +149,7 @@ int benchmark( const string& FLAGS_input_file, const string& FLAGS_input_type, int FLAGS_iter, + bool FLAGS_measure_memory, const string& FLAGS_net, const string& FLAGS_output, const string& FLAGS_output_folder, diff --git a/binaries/caffe2_benchmark.cc b/binaries/caffe2_benchmark.cc index 57bd98d49723c..19ef223bdea6c 100644 --- a/binaries/caffe2_benchmark.cc +++ b/binaries/caffe2_benchmark.cc @@ -22,6 +22,7 @@ int main(int argc, char** argv) { FLAGS_input_file, FLAGS_input_type, FLAGS_iter, + FLAGS_measure_memory, FLAGS_net, FLAGS_output, FLAGS_output_folder, diff --git a/modules/observers/net_observer_reporter.h b/modules/observers/net_observer_reporter.h index 5619b69a636e7..bfccef64cee2b 100644 --- a/modules/observers/net_observer_reporter.h +++ b/modules/observers/net_observer_reporter.h @@ -19,6 +19,7 @@ struct PerformanceInformation { std::string type = ""; // the type of the operator // Measured double latency = 0; + double cpuMilliseconds = 0; }; class CAFFE2_OBSERVER_API NetObserverReporter { diff --git a/modules/observers/net_observer_reporter_print.cc b/modules/observers/net_observer_reporter_print.cc index e00fe8c1fcc10..6baa7e01728dc 100644 --- a/modules/observers/net_observer_reporter_print.cc +++ b/modules/observers/net_observer_reporter_print.cc @@ -1,5 +1,6 @@ #include "observers/net_observer_reporter_print.h" +#include #include #include "caffe2/core/init.h" #include "observers/observer_config.h" @@ -9,6 +10,7 @@ namespace caffe2 { const std::string NetObserverReporterPrint::IDENTIFIER = "Caffe2Observer "; static std::string get_op_args(PerformanceInformation p); static std::string get_tensor_shapes(PerformanceInformation p); +static std::string sanatize(std::string json_s); void NetObserverReporterPrint::report( NetBase* net, @@ -23,29 +25,49 @@ void NetObserverReporterPrint::report( {"value", c10::to_string(p.second.latency * 1000)}, {"unit", "us"}, {"metric", "latency"}}); + caffe2_perf.push_back({{"type", "NET_"}, + { + "value", + c10::to_string( + p.second.cpuMilliseconds / + p.second.latency * + 100), + }, + {"unit", "percent"}, + {"metric", "cpu_percent"}}); } else if (p.first != "NET_DELAY") { // for operator perf std::string shape_str = get_tensor_shapes(p.second); std::string args_str = get_op_args(p.second); - - caffe2_perf.push_back({{"type", p.first}, + std::string type = p.first; + caffe2_perf.push_back({{"type", type}, {"value", c10::to_string(p.second.latency * 1000)}, {"unit", "us"}, {"metric", "latency"}}); + caffe2_perf.push_back({{"type", type}, + { + "value", + c10::to_string( + p.second.cpuMilliseconds / + p.second.latency * + 100), + }, + {"unit", "percent"}, + {"metric", "cpu_percent"}}); if (p.second.flops > 0) { - caffe2_perf.push_back({{"type", p.first}, + caffe2_perf.push_back({{"type", type}, {"value", c10::to_string(p.second.flops)}, {"unit", "flop"}, {"metric", "flops"}}); } if (shape_str != "") { - caffe2_perf.push_back({{"type", p.first}, + caffe2_perf.push_back({{"type", type}, {"info_string", shape_str}, {"unit", ""}, {"metric", "tensor_shapes"}}); } if (args_str != "") { - caffe2_perf.push_back({{"type", p.first}, + caffe2_perf.push_back({{"type", type}, {"info_string", args_str}, {"unit", ""}, {"metric", "op_args"}}); @@ -57,13 +79,13 @@ void NetObserverReporterPrint::report( std::stringstream buffer; auto entry = *it; buffer << IDENTIFIER << "{"; - buffer << "\"type\": \"" << entry["type"] << "\"," - << "\"unit\": \"" << entry["unit"] << "\"," - << "\"metric\": \"" << entry["metric"] << "\","; + buffer << "\"type\": \"" << sanatize(entry["type"]) << "\"," + << "\"unit\": \"" << sanatize(entry["unit"]) << "\"," + << "\"metric\": \"" << sanatize(entry["metric"]) << "\","; if (entry.find("value") != entry.end()) { - buffer << "\"value\": \"" << entry["value"] << "\""; + buffer << "\"value\": \"" << sanatize(entry["value"]) << "\""; } else if (entry.find("info_string") != entry.end()) { - buffer << "\"info_string\": \"" << entry["info_string"] << "\""; + buffer << "\"info_string\": \"" << sanatize(entry["info_string"]) << "\""; } buffer << "}"; LOG(INFO) << buffer.str(); @@ -117,4 +139,12 @@ static std::string get_op_args(PerformanceInformation p) { } return args_str; } + +static std::string sanatize(std::string json_s) { + // Remove illegal characters from the name that would cause json string to + // become invalid + json_s.erase(std::remove(json_s.begin(), json_s.end(), '"'), json_s.end()); + json_s.erase(std::remove(json_s.begin(), json_s.end(), '\\'), json_s.end()); + return json_s; +} } diff --git a/modules/observers/perf_observer.cc b/modules/observers/perf_observer.cc index 2de1ce65114f5..9e60179832a44 100644 --- a/modules/observers/perf_observer.cc +++ b/modules/observers/perf_observer.cc @@ -6,6 +6,10 @@ #endif #include +#include +#ifdef _WIN32 +#include +#endif #include "caffe2/core/common.h" #include "caffe2/core/init.h" #include "caffe2/core/operator.h" @@ -62,6 +66,75 @@ bool registerGlobalPerfNetObserverCreator(int* /*pargc*/, char*** /*pargv*/) { } } // namespace +#ifdef _WIN32 +double getTicksPerMillisecond() { + static LARGE_INTEGER ticks_per_sec; + if (!ticks_per_sec.QuadPart) { + QueryPerformanceFrequency(&ticks_per_sec); + if (!ticks_per_sec.QuadPart) { + return 0.0; + } + } + + return static_cast(ticks_per_sec.QuadPart) / 1000.0; +} +#else +double getClockTimeMilliseconds(clockid_t clk_id) { + int result; + struct timespec tp; + result = clock_gettime(clk_id, &tp); + if (result == -1) { + return 0.0; + } else { + return tp.tv_sec * 1000.0 + tp.tv_nsec / 1000000.0; + } +} +#endif + +double getWallClockTimeMilliseconds() { +#ifdef _WIN32 + double ticks_per_ms = getTicksPerMillisecond(); + if (ticks_per_ms) { + LARGE_INTEGER ticks; + if (QueryPerformanceCounter(&ticks)) { + return static_cast(ticks.QuadPart) / ticks_per_ms; + } + } + + return 0.0; +#else + return getClockTimeMilliseconds(CLOCK_MONOTONIC); +#endif +} + +double getCpuTimeMilliseconds() { +#ifdef _WIN32 + FILETIME creation_time; + FILETIME exit_time; + FILETIME kernel_time; + FILETIME user_time; + if (GetProcessTimes( + GetCurrentProcess(), + &creation_time, + &exit_time, + &kernel_time, + &user_time)) { + ULARGE_INTEGER kernel; + ULARGE_INTEGER user; + kernel.HighPart = kernel_time.dwHighDateTime; + kernel.LowPart = kernel_time.dwLowDateTime; + user.HighPart = user_time.dwHighDateTime; + user.LowPart = user_time.dwLowDateTime; + return (static_cast(kernel.QuadPart) + + static_cast(user.QuadPart)) / 10000.0; + } + + return 0.0; +#else + return getClockTimeMilliseconds(CLOCK_PROCESS_CPUTIME_ID); +#endif +} + REGISTER_CAFFE2_EARLY_INIT_FUNCTION( registerGlobalPerfNetObserverCreator, ®isterGlobalPerfNetObserverCreator, @@ -107,30 +180,31 @@ void PerfNetObserver::Start() { } } - if (logType_ != PerfNetObserver::NONE) { - /* Only start timer when we need to */ - timer_.Start(); - } + wallMilliseconds_ = getWallClockTimeMilliseconds(); + cpuMilliseconds_ = getCpuTimeMilliseconds(); } void PerfNetObserver::Stop() { if (logType_ == PerfNetObserver::NONE) { return; } - auto currentRunTime = timer_.MilliSeconds(); std::map info; PerformanceInformation net_perf; - net_perf.latency = currentRunTime; + net_perf.cpuMilliseconds = + getCpuTimeMilliseconds() - cpuMilliseconds_; + net_perf.latency = + getWallClockTimeMilliseconds() - wallMilliseconds_; + if (logType_ == PerfNetObserver::OPERATOR_DELAY) { const auto& operators = subject_->GetOperators(); for (int idx = 0; idx < operators.size(); ++idx) { const auto* op = operators[idx]; auto name = getObserverName(op, idx); PerformanceInformation p; - - p.latency = static_cast(observerMap_[op]) - ->getMilliseconds(); - + const PerfOperatorObserver* opObserver = + static_cast(observerMap_[op]); + p.latency = opObserver->getWallMilliseconds(); + p.cpuMilliseconds = opObserver->getCpuMilliseconds(); p.engine = op->engine(); p.type = op->type(); p.tensor_shapes = @@ -176,30 +250,34 @@ PerfOperatorObserver::PerfOperatorObserver( PerfNetObserver* netObserver) : ObserverBase(op), netObserver_(netObserver), - milliseconds_(0) { + wallMilliseconds_(0), + cpuMilliseconds_(0) { CAFFE_ENFORCE(netObserver_, "Observers can't operate outside of the net"); } PerfOperatorObserver::~PerfOperatorObserver() {} void PerfOperatorObserver::Start() { - /* Get the time from the start of the net minus the time spent - in previous invocations. It is the time spent on other operators. - This way, when the operator finishes, the time from the start of the net - minus the time spent in all other operators is the total time on this - operator. This is done to avoid saving a timer in each operator */ - milliseconds_ = netObserver_->getTimer().MilliSeconds() - milliseconds_; + wallMilliseconds_ = getWallClockTimeMilliseconds(); + cpuMilliseconds_ = getCpuTimeMilliseconds(); } void PerfOperatorObserver::Stop() { /* Time from the start of the net minus the time spent on all other operators is the time spent on this operator */ - milliseconds_ = netObserver_->getTimer().MilliSeconds() - milliseconds_; + cpuMilliseconds_ = + getCpuTimeMilliseconds() - cpuMilliseconds_; + wallMilliseconds_ = + getWallClockTimeMilliseconds() - wallMilliseconds_; tensor_shapes_ = subject_->InputTensorShapes(); } -double PerfOperatorObserver::getMilliseconds() const { - return milliseconds_; +double PerfOperatorObserver::getWallMilliseconds() const { + return wallMilliseconds_; +} + +double PerfOperatorObserver::getCpuMilliseconds() const { + return cpuMilliseconds_; } std::vector PerfOperatorObserver::getTensorShapes() const { diff --git a/modules/observers/perf_observer.h b/modules/observers/perf_observer.h index 11fb870a61961..71e1190e840ba 100644 --- a/modules/observers/perf_observer.h +++ b/modules/observers/perf_observer.h @@ -10,16 +10,13 @@ namespace caffe2 { +double getClockTimeMilliseconds(); class CAFFE2_OBSERVER_API PerfNetObserver : public NetObserver { public: explicit PerfNetObserver(NetBase* subject_); virtual ~PerfNetObserver(); - caffe2::Timer& getTimer() { - return timer_; - } - private: void Start() override; void Stop() override; @@ -37,7 +34,8 @@ class CAFFE2_OBSERVER_API PerfNetObserver : public NetObserver { std::unordered_map*> observerMap_; - caffe2::Timer timer_; + double wallMilliseconds_; + double cpuMilliseconds_; }; class PerfOperatorObserver : public ObserverBase { @@ -45,7 +43,8 @@ class PerfOperatorObserver : public ObserverBase { PerfOperatorObserver(OperatorBase* op, PerfNetObserver* netObserver); virtual ~PerfOperatorObserver(); - double getMilliseconds() const; + double getWallMilliseconds() const; + double getCpuMilliseconds() const; std::vector getTensorShapes() const; private: @@ -60,7 +59,8 @@ class PerfOperatorObserver : public ObserverBase { // without storing inside the operator observer. Each field is memory // costly here and a raw pointer is a cheapest sholution PerfNetObserver* netObserver_; - double milliseconds_; + double wallMilliseconds_; + double cpuMilliseconds_; std::vector tensor_shapes_; }; } // namespace caffe2