Add some doc files and utilities

huichen · Feb 20, 2010 · a551a6e · a551a6e
1 parent 95cc4a3
commit a551a6e
Show file tree

Hide file tree

Showing 5 changed files with 394 additions and 0 deletions.
diff --git a/bin/report_mutex.php b/bin/report_mutex.php
@@ -0,0 +1,49 @@
+<?php
+
+// Usage: php report_mutex.php <server[:port]> [top] [translate]
+//
+//   server     host (and admin port) to query, e.g. localhost:9999
+//   top        how many stacks to list per section; defaults to 20
+//   translate  any truthy value asks the server to translate each stack
+//
+// Pulls the aggregated mutex.* stats from the admin server's /stats.kvp
+// endpoint and prints the stacks with the most sampling hits, followed by the
+// stacks with the largest accumulated time.
+
+$server = isset($argv[1]) ? $argv[1] : '';
+$top = isset($argv[2]) ? (int)$argv[2] : 0;
+$translate = isset($argv[3]) ? $argv[3] : false;
+if ($server === '') {
+  exit("Usage: php report_mutex.php <server[:port]> [top] [translate]\n");
+}
+if ($top <= 0) $top = 20;
+
+// Quote the whole URL: $server comes from the command line, and the query
+// string contains '*', '&' and '?' which the shell must not interpret.
+$url = "http://$server/stats.kvp?agg=*&keys=:mutex.*:";
+$ret = shell_exec('GET '.escapeshellarg($url));
+
+// shell_exec() returns NULL when the command fails or prints nothing.
+$stats = is_string($ret) ? json_decode($ret, true) : null;
+if (!is_array($stats) || !$stats) {
+  exit("No mutex profile data was found on server\n");
+}
+
+// Split the flat key => value list into per-stack hit counts and times.
+$hits = array();
+$times = array();
+foreach ($stats as $name => $count) {
+  if (preg_match('/mutex.([0-9a-f:]+).(hit|time)/', $name, $m)) {
+    $stack = $m[1];
+    $type = $m[2];
+
+    if ($type == 'hit') {
+      $hits[$stack] = $count;
+    } else {
+      $times[$stack] = $count;
+    }
+  }
+}
+
+// Keep the $top largest entries of each kind. preserve_keys must be true:
+// a stack made only of decimal digits becomes an integer array key, and
+// array_slice() would otherwise renumber it and lose the stack itself.
+arsort($hits); $hits = array_slice($hits, 0, $top, true);
+arsort($times); $times = array_slice($times, 0, $top, true);
+
+print str_repeat('=', 70)."\n";
+foreach ($hits as $stack => $count) {
+  print $count ." x sampling hits:\n";
+  // Translated output is printed as returned by the server; only the raw
+  // stack gets a newline appended.
+  print $translate ? translate_stack($stack) : ($stack."\n");
+  print str_repeat('-', 70)."\n";
+}
+print str_repeat('=', 70)."\n";
+foreach ($times as $stack => $count) {
+  // NOTE(review): the division implies the value is in microseconds - confirm.
+  print (int)($count/1000000) ." seconds:\n";
+  print $translate ? translate_stack($stack) : ($stack."\n");
+  print str_repeat('-', 70)."\n";
+}
+
+/**
+ * Asks the admin server's /translate endpoint to translate a stack trace.
+ *
+ * Uses the global $server (host[:port]) set from the command line and shells
+ * out to the `GET` command-line tool.
+ *
+ * @param string $stack  hex-encoded stack trace, as found in mutex.* keys
+ * @return string|null   whatever the server printed, or NULL if the command
+ *                       failed or produced no output
+ */
+function translate_stack($stack) {
+  global $server;
+  // Quote the URL so the shell does not treat '?' as a glob character and so
+  // that nothing in $server or $stack can be interpreted as shell syntax.
+  $url = "http://$server/translate?stack=$stack";
+  return shell_exec('GET '.escapeshellarg($url));
+}
+
diff --git a/doc/command.admin_server b/doc/command.admin_server
@@ -0,0 +1,68 @@
+<h2>Admin Server URL Commands</h2>
+
+When running a compiled program as an HTTP server, by default it runs an
+admin server on a specified port. One can send an HTTP request to this port
+to perform certain actions. To list all possible commands,
+
+  GET http://localhost:9999
+
+This is a list of available URLs:
+
+
+/stop:            stop the web server
+/translate:       translate hex encoded stacktrace in 'stack' param
+    stack         required, stack trace to translate
+    build-id      optional, if specified, build ID has to match
+    bare          optional, whether to display frame ordinates
+/build-id:        returns build id that's passed in from command line
+/check-load:      how many threads are actively handling requests
+/check-mem:       report memory quick statistics in log file
+/check-apc:       report APC quick statistics
+/status.xml:      show server status in XML
+/status.json:     show server status in JSON
+/status.html:     show server status in HTML
+/stats-on:        main switch: enable server stats
+/stats-off:       main switch: disable server stats
+/stats-clear:     clear all server stats
+/stats-web:       turn on/off server page stats (CPU and gen time)
+/stats-mem:       turn on/off memory statistics
+/stats-apc:       turn on/off APC statistics
+/stats-apc-key:   turn on/off APC key statistics
+/stats-mcc:       turn on/off memcache statistics
+/stats-sql:       turn on/off SQL statistics
+/stats-mutex:     turn on/off mutex statistics
+    sampling      optional, default 1000
+/stats.keys:      list all available keys
+    from          optional, <timestamp>, or <-n> second ago
+    to            optional, <timestamp>, or <-n> second ago
+/stats.xml:       show server stats in XML
+/stats.json:      show server stats in JSON
+/stats.kvp:       show server stats in key-value pairs
+/stats.html:      show server stats in HTML
+    from          optional, <timestamp>, or <-n> second ago
+    to            optional, <timestamp>, or <-n> second ago
+    agg           optional, aggregation: *, url, code
+    keys          optional, <key>,<key/hit>,<key/sec>,<:regex:>
+    url           optional, only stats of this page or URL
+    code          optional, only stats of pages returning this code
+
+If program was compiled with GOOGLE_CPU_PROFILER, these commands will become available,
+
+/prof-cpu-on:     turn on CPU profiler
+/prof-cpu-off:    turn off CPU profiler
+
+If program was compiled with GOOGLE_HEAP_PROFILER, these commands will become available,
+
+/prof-heap-on:    turn on heap profiler
+/prof-heap-dump:  take one snapshot of the heap
+/prof-heap-off:   turn off heap profiler
+/stats-malloc:    turn on/off malloc statistics
+/leak-on:         start leak detection
+    sampling      required, frequency
+/leak-off:        end leak detection and report leaking
+    cutoff        optional, default 20 seconds, ignore newer allocs
+
+If program was compiled with GOOGLE_TCMALLOC, these commands will become available,
+
+/free-mem:        ask tcmalloc to release memory to system
+/tcmalloc-stats:  get internal tcmalloc stats
diff --git a/doc/debug.leak b/doc/debug.leak
@@ -0,0 +1,62 @@
+9999
+<h2>Debugging Memory Leaks</h2>
+
+First of all, we need unit tests to verify different classes and functions
+(esp. extension functions) don't have memory leaks by running under valgrind
+like this:
+
+  GLIBCXX_FORCE_NEW=1 \
+  valgrind --suppressions=../bin/valgrind.suppression --tool=memcheck \
+        --leak-check=full --num-callers=30 --max-stackframe=3000000 \
+        test/test TestExtFoo::test_ext_bar
+
+When it comes to a running server, it becomes impractical to run valgrind or
+full heap profiling, because they slow down request handling too much. Here's
+the procedure to run built-in memory leak detection against a live server:
+
+1. Turn on heap profiler
+
+Build the server (both HPHP and www) with modification of rules.mk:
+
+  DEBUG=1
+  #GOOGLE_CPU_PROFILER = 1
+  GOOGLE_HEAP_PROFILER = 1
+
+This turns off CPU profiler and turns on heap profiler that gives us malloc()
+hooks for our own sampling based leak detection. We also need to turn on DEBUG
+to generate readable stacktraces.
+
+2. Turn off mt_allocator
+
+Run server with <b>GLIBCXX_FORCE_NEW=1</b>. This environment variable turns
+off STL's mt_allocator, which doesn't call free() when some STL objects are
+destructed.
+
+3. Initialize long-living objects
+
+Let the server run for a few minutes, until APC is mostly updated. Otherwise,
+APC objects may be reported as leaked items.
+
+4. Turn on leak detection
+
+Hit the server to turn on leak detection:
+
+  GET http://[server]:9999/leak-on?sampling=500
+
+The higher the sampling rate, the less impact leak detection has on server
+running, but it will take longer to collect leaked items. 500 is a good rate
+in our debugging process.
+
+5. Report leaks
+
+Wait for minutes, or even hours, depending on how rarely the leak
+happens. Then hit the server to turn off leak detection and to report leaks:
+
+  GET http://[server]:9999/leak-off > leak_report
+
+6. Examine output
+
+The output should have all leaked items. Sometimes some stacks are not
+fully translated, and a manual translation needs to be done like this:
+
+  ./www --mode translate <hex-coded-stacktrace>
diff --git a/doc/debug.mutex b/doc/debug.mutex
@@ -0,0 +1,26 @@
+
+<h2>Debugging Excessive Mutex</h2>
+
+1. Turn on mutex stats
+
+Hit admin port with /stats-mutex to turn on mutex stats:
+
+   GET http://localhost:9999/stats-mutex
+
+2. Query mutex stats
+
+Get mutex stats like this,
+
+   GET "http://localhost:9999/stats.kvp?agg=*&keys=:mutex.*:"
+
+3. Pre-written script
+
+Or, run bin/report_mutex.php to report stats,
+
+   php ../bin/report_mutex.php localhost 10 1
+
+4. Turn off mutex stats
+
+Hit admin port with /stats-mutex to turn off mutex stats:
+
+   GET http://localhost:9999/stats-mutex
diff --git a/doc/stats b/doc/stats
@@ -0,0 +1,189 @@
+
+<h2>Server Stats</h2>
+
+For each page, we collect stats by time slots. Each time slot is configured as
+StatsSlotDuration seconds and server internally keeps StatsMaxSlot number of
+slots. Inside each slot, we keep a set of stats by page or URL. These stats
+include 3 built-in ones ("url", "code" and "hit") and many key-value pairs
+defined by different parts of the system.
+
+  slot:
+    time:
+    pages:
+      page:
+        url:   original URL
+        code:  return code
+        hit:   total counts
+        details:
+          key-value pair
+          key-value pair
+          key-value pair
+          ...
+
+
+<h2>Stats Query</h2>
+
+To query stats, hit admin port with a URL like this,
+
+  http://[server]:9999/stats.[fmt]?from=[t1]&to=[t2]...
+
+from: (optional) starting time's timestamp (e.g. 1251927393),
+        - use -n for n seconds ago
+        - when omitted or 0, it will be the earliest possible time server keeps
+
+to:   (optional) ending time's timestamp,
+        - use -n for n seconds ago
+        - when omitted or 0, it will be "now"
+
+agg:  (optional) aggregation, can be any one of these,
+        *          aggregate all data into one list of key value pairs
+        url        aggregate all data by URLs
+        code       aggregate all data by response code
+        (omitted)  default by time slots
+
+keys: (optional) comma delimited keys to query, each of which can be decorated
+        [key]      just the key's value, e.g. "sql.conn"
+        [key]/hit  average per page hit, e.g. "sql.conn/hit"
+        [key]/sec  per second rate, e.g. "sql.conn/sec"
+        :[regex]:  keys matching the regular expression
+        (omitted)  all available keys
+
+url:  (optional) only output stats matching the specified URL
+
+code: (optional) only output stats of pages that have response code
+
+[fmt]: can be one of these:
+
+        xml        XML format
+        json       JSON format
+        kvp        simple key-value pairs in JSON format, assuming agg=*
+
+
+<h2>Available Keys</h2>
+
+1. SQL Stats:
+
+(1) Connections
+
+sql.conn:       number of connections newly created
+sql.reconn_new: number of connections newly created when trying to reconnect
+sql.reconn_ok:  number of connections re-picked up when trying to reconnect
+sql.reconn_old: number of connections dropped when trying to reconnect
+
+(2) Queries
+
+sql.query:                number of queries executed
+sql.query.[table].[verb]: per table-verb stats
+sql.query.[verb]:         per verb stats, where [verb] can be one of these:
+
+- select
+- insert
+- update
+- replace
+- delete
+- begin
+- commit
+- rollback
+- unknown
+
+2. MemCache Stats:
+
+mcc.madd:           number of multi_add() calls
+mcc.madd.count:     total count of multi added keys
+mcc.mreplace:       number of multi_replace() calls
+mcc.mreplace.count: total count of multi replaced keys
+mcc.set:            number of set() calls
+mcc.add:            number of add() calls
+mcc.decr:           number of decr() calls
+mcc.incr:           number of incr() calls
+mcc.delete:         number of delete() calls
+mcc.delete_details: number of delete_details() calls
+mcc.get:            number of get() calls
+mcc.mget:           number of multi_get() calls
+mcc.mget.count:     total count of multi got keys
+mcc.replace:        number of replace() calls
+mcc.set:            number of set() calls
+mcc.stats:          number of stats() calls
+
+3. APC Stats:
+
+apc.miss:   number of item misses
+apc.hit:    number of item hits
+apc.update: number of item updates
+apc.new:    number of new items
+apc.erased: number of successfully erased items
+apc.erase:  number of items that failed to erase (because they were absent)
+apc.inc:    number of inc() calls
+apc.cas:    number of cas() calls
+
+4. Memory Stats:
+
+mem.[type].[size].alloc: total number of objects allocated of the type
+mem.[type].[size].freed: total number of objects freed of the type
+
+These two stats are only available when Google heap profiler is turned on for
+debugging purposes:
+
+mem.malloc.peak:   peak malloc()-ed memory
+mem.malloc.leaked: leaked malloc()-ed memory
+
+5. Page Sections:
+
+page.wall.[section]:   wall time a page section takes
+page.cpu.[section]:    CPU time a page section takes
+mem.[section]:         SmartAllocator memory a page section takes
+network.uncompressed:  total bytes to be sent before compression
+network.compressed:    total bytes sent after compression
+
+Section can be one of these:
+
+- queuing
+- all
+- input
+- invoke
+- send
+- psp
+- rollback
+- free
+
+6. evhttp Stats:
+
+- evhttp.hit:             used cached connection
+- evhttp.hit.<address>    used cached connection by URL
+- evhttp.miss             no cached connection available
+- evhttp.miss.<address>   no cached connection available by URL
+- evhttp.close            cached connection got closed
+- evhttp.close.<address>  cached connection got closed by URL
+- evhttp.skip             not set to use cached connection
+- evhttp.skip.<address>   not set to use cached connection by URL
+
+7. Application Stats:
+
+PHP page can collect application-defined stats by calling
+
+  hphp_stats($key, $count);
+
+where $key is arbitrary and $count will be tallied across different calls of
+the same key.
+
+8. Special Keys:
+
+hit:   page hit
+load:  number of active worker threads
+idle:  number of idle worker threads
+
+
+<h2>Example URL</h2>
+
+  GET "http://localhost:9999/stats.kvp?prefix=hphp&agg=*" \
+    "&keys=apc.hit/sec,hit,load,:sql.query..*.select:," \
+    "network.compressed/hit,hit/sec"
+
+This URL queries the following data:
+
+hit:                        page hits
+hit/sec:                    requests per second
+apc.hit/sec:                APC hit per second
+load:                       number of active threads currently
+network.compressed/hit:     sent bytes per request
+:sql.query..*.select:       all SELECTs on different tables