core_set.py (forked from chromium/chromium)
# Copyright 2016 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Compute core set for a page.

This script is a collection of utilities for working with core sets.
"""
import argparse
import glob
import json
import logging
import multiprocessing
import os
import sys

import dependency_graph
import loading_trace
import request_dependencies_lens
import resource_sack


def _Progress(x):
  sys.stderr.write(x + '\n')


def _PageCore(prefix, graph_set_names, output):
  """Compute the page core over sets defined by graph_set_names."""
  assert graph_set_names
  graph_sets = []
  sack = resource_sack.GraphSack()
  for name in graph_set_names:
    name_graphs = []
    _Progress('Processing %s' % name)
    for filename in glob.iglob('-'.join([prefix, name, '*.trace'])):
      _Progress('Reading %s' % filename)
      trace = loading_trace.LoadingTrace.FromJsonFile(filename)
      graph = dependency_graph.RequestDependencyGraph(
          trace.request_track.GetEvents(),
          request_dependencies_lens.RequestDependencyLens(trace))
      sack.ConsumeGraph(graph)
      name_graphs.append(graph)
    graph_sets.append(name_graphs)
  core = sack.CoreSet(*graph_sets)
  json.dump({'page_core': [{'label': b.label,
                            'name': b.name,
                            'count': b.num_nodes}
                           for b in core],
             'non_core': [{'label': b.label,
                           'name': b.name,
                           'count': b.num_nodes}
                          for b in sack.bags if b not in core],
             'threshold': sack.CORE_THRESHOLD},
            output, sort_keys=True, indent=2)
  output.write('\n')
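
# For reference, the JSON written by _PageCore above has roughly this shape
# (keys are sorted; the counts and names here are purely illustrative):
#   {
#     "non_core": [{"count": 3, "label": "...", "name": "..."}, ...],
#     "page_core": [{"count": 7, "label": "...", "name": "..."}, ...],
#     "threshold": <value of sack.CORE_THRESHOLD>
#   }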


def _DoSite(site, graph_sets, input_dir, output_dir):
  """Compute the appropriate page core for a site.

  Used by _Spawn.
  """
  _Progress('Doing %s on %s' % (site, '/'.join(graph_sets)))
  prefix = os.path.join(input_dir, site)
  with open(os.path.join(output_dir,
                         '%s-%s.json' % (site, '.'.join(graph_sets))),
            'w') as output:
    _PageCore(prefix, graph_sets, output)
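
# For example (hypothetical values), site 'www.example.com' with graph sets
# ['cold', 'warm'] and output_dir 'cores' is written to
# cores/www.example.com-cold.warm.json.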


def _DoSiteRedirect(t):
  """Unpack arguments for map call.

  Note that multiprocessing.Pool.map cannot use a lambda (as it needs to be
  serialized into the executing process).
  """
  _DoSite(*t)


def _Spawn(site_list_file, graph_sets, input_dir, output_dir, workers):
  """Spool site computation out to a multiprocessing pool."""
  with open(site_list_file) as site_file:
    sites = [l.strip() for l in site_file.readlines()]
  _Progress('Using sites:\n %s' % '\n '.join(sites))
  pool = multiprocessing.Pool(workers, maxtasksperchild=1)
  pool.map(_DoSiteRedirect, [(s, graph_sets, input_dir, output_dir)
                             for s in sites])
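
# The --sites file consumed by 'spawn' is read as one site name per line,
# e.g. (hypothetical):
#   www.example.com
#   news.example.org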


def _ReadCoreSet(filename):
  with open(filename) as core_file:
    data = json.load(core_file)
  return set(page['name'] for page in data['page_core'])


def _Compare(a_name, b_name, csv):
  """Compare two core sets."""
  a = _ReadCoreSet(a_name)
  b = _ReadCoreSet(b_name)
  result = (resource_sack.GraphSack.CoreSimilarity(a, b),
            ' Equal' if a == b else 'UnEqual',
            'a<=b' if a <= b else 'a!<b',
            'a>=b' if b <= a else 'a!>b')
  if csv:
    print('%s,%s,%s,%s' % result)
  else:
    print('%.2f %s %s %s' % result)
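
# Example 'compare' output lines (the similarity value here is illustrative):
#   default: 0.67 UnEqual a!<b a!>b
#   --csv:   0.67,UnEqual,a!<b,a!>b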


if __name__ == '__main__':
  logging.basicConfig(level=logging.ERROR)
  parser = argparse.ArgumentParser()
  subparsers = parser.add_subparsers()

  spawn = subparsers.add_parser(
      'spawn', help=('spawn page core set computation from a sites list.\n'
                     'A core set will be computed for each site by '
                     'combining all run indices from site traces for each '
                     '--set, then computing the page core over the sets. '
                     'Assumes trace file names in form {input-dir}/'
                     '{site}-{set}-{run index}.trace'))
  spawn.add_argument('--sets', required=True,
                     help='sets to combine, comma-separated')
  spawn.add_argument('--sites', required=True, help='file containing sites')
  spawn.add_argument('--workers', default=8, type=int,
                     help=('number of parallel workers. Each worker seems to '
                           'use about 0.5-1 GB per trace when processing. '
                           'Total memory usage should be kept below physical '
                           'memory for the job to run in a reasonable time'))
  spawn.add_argument('--input_dir', required=True,
                     help='trace input directory')
  spawn.add_argument('--output_dir', required=True,
                     help=('core set output directory. Each site will have '
                           'one JSON file generated listing the core set as '
                           'well as some metadata like the threshold used'))
  spawn.set_defaults(executor=lambda args:
                     _Spawn(site_list_file=args.sites,
                            graph_sets=args.sets.split(','),
                            input_dir=args.input_dir,
                            output_dir=args.output_dir,
                            workers=args.workers))

  page_core = subparsers.add_parser(
      'page_core',
      help=('compute page core set for a group of files of form '
            '{--prefix}{set}*.trace over each set in --sets'))
  page_core.add_argument('--sets', required=True,
                         help='sets to combine, comma-separated')
  page_core.add_argument('--prefix', required=True,
                         help='trace file prefix')
  page_core.add_argument('--output', required=True,
                         help='JSON output file name')
  page_core.set_defaults(executor=lambda args:
                         _PageCore(args.prefix, args.sets.split(','),
                                   open(args.output, 'w')))

  compare = subparsers.add_parser(
      'compare',
      help=('compare two core sets (as output by spawn, page_core or '
            'all_cores) using the Jaccard index. Output goes to stdout'))
  compare.add_argument('--a', required=True, help='the first core set JSON')
  compare.add_argument('--b', required=True, help='the second core set JSON')
  compare.add_argument('--csv', action='store_true', help='output as CSV')
  compare.set_defaults(
      executor=lambda args:
      _Compare(args.a, args.b, args.csv))

  args = parser.parse_args()
  args.executor(args)