core_set.py (forked from chromium/chromium)
# Copyright 2016 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Compute core set for a page.

This script is a collection of utilities for working with core sets.
"""
import argparse
import glob
import json
import logging
import multiprocessing
import os
import sys

import dependency_graph
import loading_trace
import request_dependencies_lens
import resource_sack


def _Progress(x):
  sys.stderr.write(x + '\n')


def _PageCore(prefix, graph_set_names, output):
  """Compute the page core over sets defined by graph_set_names."""
  assert graph_set_names
  graph_sets = []
  sack = resource_sack.GraphSack()
  for name in graph_set_names:
    name_graphs = []
    _Progress('Processing %s' % name)
    for filename in glob.iglob('-'.join([prefix, name, '*.trace'])):
      _Progress('Reading %s' % filename)
      trace = loading_trace.LoadingTrace.FromJsonFile(filename)
      graph = dependency_graph.RequestDependencyGraph(
          trace.request_track.GetEvents(),
          request_dependencies_lens.RequestDependencyLens(trace))
      sack.ConsumeGraph(graph)
      name_graphs.append(graph)
    graph_sets.append(name_graphs)
  core = sack.CoreSet(*graph_sets)
  json.dump({'page_core': [{'label': b.label,
                            'name': b.name,
                            'count': b.num_nodes}
                           for b in core],
             'non_core': [{'label': b.label,
                           'name': b.name,
                           'count': b.num_nodes}
                          for b in sack.bags if b not in core],
             'threshold': sack.CORE_THRESHOLD},
            output, sort_keys=True, indent=2)
  output.write('\n')
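
# For reference, the JSON written by _PageCore above has roughly this shape
# (keys are sorted; the counts and names here are purely illustrative):
#   {
#     "non_core": [{"count": 3, "label": "...", "name": "..."}, ...],
#     "page_core": [{"count": 7, "label": "...", "name": "..."}, ...],
#     "threshold": <value of sack.CORE_THRESHOLD>
#   }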


def _DoSite(site, graph_sets, input_dir, output_dir):
  """Compute the appropriate page core for a site.

  Used by _Spawn.
  """
  _Progress('Doing %s on %s' % (site, '/'.join(graph_sets)))
  prefix = os.path.join(input_dir, site)
  with open(os.path.join(output_dir,
                         '%s-%s.json' % (site, '.'.join(graph_sets))),
            'w') as output:
    _PageCore(prefix, graph_sets, output)
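
# For example (hypothetical values), site 'www.example.com' with graph sets
# ['cold', 'warm'] and output_dir 'cores' is written to
# cores/www.example.com-cold.warm.json.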


def _DoSiteRedirect(t):
  """Unpack arguments for map call.

  Note that multiprocessing.Pool.map cannot use a lambda (as it needs to be
  serialized into the executing process).
  """
  _DoSite(*t)


def _Spawn(site_list_file, graph_sets, input_dir, output_dir, workers):
  """Spool site computation out to a multiprocessing pool."""
  with open(site_list_file) as site_file:
    sites = [l.strip() for l in site_file.readlines()]
  _Progress('Using sites:\n %s' % '\n '.join(sites))
  pool = multiprocessing.Pool(workers, maxtasksperchild=1)
  pool.map(_DoSiteRedirect, [(s, graph_sets, input_dir, output_dir)
                             for s in sites])
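
# The --sites file consumed by 'spawn' is read as one site name per line,
# e.g. (hypothetical):
#   www.example.com
#   news.example.org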


def _ReadCoreSet(filename):
  with open(filename) as core_file:
    data = json.load(core_file)
  return set(page['name'] for page in data['page_core'])


def _Compare(a_name, b_name, csv):
  """Compare two core sets."""
  a = _ReadCoreSet(a_name)
  b = _ReadCoreSet(b_name)
  result = (resource_sack.GraphSack.CoreSimilarity(a, b),
            ' Equal' if a == b else 'UnEqual',
            'a<=b' if a <= b else 'a!<b',
            'a>=b' if b <= a else 'a!>b')
  if csv:
    print('%s,%s,%s,%s' % result)
  else:
    print('%.2f %s %s %s' % result)
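
# Example 'compare' output lines (the similarity value here is illustrative):
#   default: 0.67 UnEqual a!<b a!>b
#   --csv:   0.67,UnEqual,a!<b,a!>b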


if __name__ == '__main__':
  logging.basicConfig(level=logging.ERROR)
  parser = argparse.ArgumentParser()
  subparsers = parser.add_subparsers()

  spawn = subparsers.add_parser(
      'spawn', help=('spawn page core set computation from a sites list.\n'
                     'A core set will be computed for each site by '
                     'combining all run indices from site traces for each '
                     '--set, then computing the page core over the sets. '
                     'Assumes trace file names in form {input-dir}/'
                     '{site}-{set}-{run index}.trace'))
  spawn.add_argument('--sets', required=True,
                     help='sets to combine, comma-separated')
  spawn.add_argument('--sites', required=True, help='file containing sites')
  spawn.add_argument('--workers', default=8, type=int,
                     help=('number of parallel workers. Each worker seems to '
                           'use about 0.5-1 GB per trace when processing. '
                           'Total memory usage should be kept below physical '
                           'memory for the job to run in a reasonable time'))
  spawn.add_argument('--input_dir', required=True,
                     help='trace input directory')
  spawn.add_argument('--output_dir', required=True,
                     help=('core set output directory. Each site will have '
                           'one JSON file generated listing the core set as '
                           'well as some metadata like the threshold used'))
  spawn.set_defaults(executor=lambda args:
                     _Spawn(site_list_file=args.sites,
                            graph_sets=args.sets.split(','),
                            input_dir=args.input_dir,
                            output_dir=args.output_dir,
                            workers=args.workers))

  page_core = subparsers.add_parser(
      'page_core',
      help=('compute page core set for a group of files of form '
            '{--prefix}{set}*.trace over each set in --sets'))
  page_core.add_argument('--sets', required=True,
                         help='sets to combine, comma-separated')
  page_core.add_argument('--prefix', required=True,
                         help='trace file prefix')
  page_core.add_argument('--output', required=True,
                         help='JSON output file name')
  page_core.set_defaults(executor=lambda args:
                         _PageCore(args.prefix, args.sets.split(','),
                                   open(args.output, 'w')))

  compare = subparsers.add_parser(
      'compare',
      help=('compare two core sets (as output by spawn, page_core or '
            'all_cores) using the Jaccard index. Output goes to stdout'))
  compare.add_argument('--a', required=True, help='the first core set JSON')
  compare.add_argument('--b', required=True, help='the second core set JSON')
  compare.add_argument('--csv', action='store_true', help='output as CSV')
  compare.set_defaults(
      executor=lambda args:
      _Compare(args.a, args.b, args.csv))

  args = parser.parse_args()
  args.executor(args)