#!/usr/bin/env python
# Copyright 2019 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Runs telemetry benchmarks on representative story tag.
This script is a wrapper around run_performance_tests.py to capture the
values of performance metrics and compare them with the acceptable limits
in order to prevent regressions.
Arguments used for this script are the same as run_performance_tests.py.
The name and some functionalities of this script should be adjusted for
use with other benchmarks.
"""

from __future__ import print_function

import argparse
import csv
import json
import numpy as np
import os
import sys
import time

import common
import run_performance_tests

# AVG_ERROR_MARGIN determines how much higher the measured frame_times average
# may be than the recorded upper limit before the story is considered a
# failure (expressed as a multiplier of the upper limit).
AVG_ERROR_MARGIN = 1.1

# CI stands for confidence interval. The "ci_095" values recorded in the data
# are the ranges between the upper and lower 95% CIs. CI_ERROR_MARGIN is the
# maximum acceptable ratio of the measured ci_095 to the recorded one.
# TODO(behdadb) crbug.com/1052054
CI_ERROR_MARGIN = 1.5

METRIC_NAME = 'frame_times'
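
# A worked example of the gating arithmetic above (numbers are made up): with
# a recorded upper limit of avg = 20.0 ms and ci_095 = 2.0 ms for a story, a
# measured average above 20.0 * AVG_ERROR_MARGIN = 22.0 ms is reported as a
# regression (unless it is invalidated by a low cpu_wall_time_ratio), and a
# measured ci_095 above 2.0 * CI_ERROR_MARGIN = 3.0 ms on a control story
# marks the run as too noisy to be trusted.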


class ResultRecorder(object):
  def __init__(self):
    self.fails = 0
    self.tests = 0
    self.start_time = time.time()
    self.output = {}
    self.return_code = 0
    self._failed_stories = set()
    self._noisy_control_stories = set()
    # The set of _noisy_control_stories keeps track of control tests which
    # failed because of high noise values.

  def set_tests(self, output):
    self.output = output
    self.fails = output['num_failures_by_type'].get('FAIL', 0)
    self.tests = self.fails + output['num_failures_by_type'].get('PASS', 0)

  def add_failure(self, name, benchmark, is_control=False):
    self.output['tests'][benchmark][name]['actual'] = 'FAIL'
    self.output['tests'][benchmark][name]['is_unexpected'] = True
    self._failed_stories.add(name)
    self.fails += 1
    if is_control:
      self._noisy_control_stories.add(name)

  def remove_failure(self, name, benchmark, is_control=False,
                     invalidation_reason=None):
    self.output['tests'][benchmark][name]['actual'] = 'PASS'
    self.output['tests'][benchmark][name]['is_unexpected'] = False
    self._failed_stories.remove(name)
    self.fails -= 1
    if is_control:
      self._noisy_control_stories.remove(name)
    if invalidation_reason:
      self.add_invalidation_reason(name, benchmark, invalidation_reason)

  def invalidate_failures(self, benchmark):
    # This method invalidates the recorded failures when the control test is
    # noisy.
    for story in self._failed_stories.copy():
      print(story + ' [Invalidated Failure]: The story failed but was ' +
            'invalidated as a result of noisy control test.')
      self.remove_failure(story, benchmark, False, 'Noisy control test')

  def add_invalidation_reason(self, name, benchmark, reason):
    self.output['tests'][benchmark][name]['invalidation_reason'] = reason

  @property
  def failed_stories(self):
    return self._failed_stories

  @property
  def is_control_stories_noisy(self):
    return len(self._noisy_control_stories) > 0

  def get_output(self, return_code):
    self.output['seconds_since_epoch'] = time.time() - self.start_time
    self.output['num_failures_by_type']['PASS'] = self.tests - self.fails
    self.output['num_failures_by_type']['FAIL'] = self.fails
    if return_code == 1:
      self.output['interrupted'] = True

    plural = lambda n, s, p: '%d %s' % (n, p if n != 1 else s)
    tests = lambda n: plural(n, 'test', 'tests')

    print('[ PASSED ] ' + tests(self.tests - self.fails) + '.')
    if self.fails > 0:
      print('[ FAILED ] ' + tests(self.fails) + '.')
      self.return_code = 1

    return (self.output, self.return_code)
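

# An illustrative sketch (not exhaustive, values made up) of the results
# dictionary that ResultRecorder reads and updates. The real dictionary is the
# JSON test-results output produced by run_performance_tests.py:
#
#   {
#     'tests': {
#       'rendering.desktop': {
#         'some_story': {'actual': 'PASS', 'is_unexpected': False}
#       }
#     },
#     'num_failures_by_type': {'PASS': 1, 'FAIL': 0},
#     'seconds_since_epoch': 123.4
#   }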


class RenderingRepresentativePerfTest(object):
  def __init__(self, initialization_for_tests=False):
    self.return_code = 0
    # One result_recorder for the re-run (True) and one for the initial
    # run (False).
    self.result_recorder = {
        True: ResultRecorder(),
        False: ResultRecorder()
    }

    if initialization_for_tests is True:
      return

    self.options = parse_arguments()
    print(self.options)
    self.benchmark = self.options.benchmarks

    out_dir_path = os.path.dirname(self.options.isolated_script_test_output)
    re_run_output_dir = os.path.join(out_dir_path, 're_run_failures')

    self.output_path = {
        True: os.path.join(
            re_run_output_dir, self.benchmark, 'test_results.json'),
        False: os.path.join(out_dir_path, self.benchmark, 'test_results.json')
    }
    self.results_path = {
        True: os.path.join(
            re_run_output_dir, self.benchmark, 'perf_results.csv'),
        False: os.path.join(out_dir_path, self.benchmark, 'perf_results.csv')
    }

    re_run_test_output = os.path.join(re_run_output_dir,
        os.path.basename(self.options.isolated_script_test_output))

    self.set_platform_specific_attributes()

    # The values used as the upper limit are the 99th percentile of the
    # avg and ci_095 frame_times recorded by the dashboard over the past 200
    # revisions. If the value measured here is higher than this upper limit
    # by at least 10 percent of the upper limit [AVG_ERROR_MARGIN], that is
    # considered a failure. crbug.com/953895
    with open(
        os.path.join(os.path.dirname(__file__),
                     'representative_perf_test_data',
                     'representatives_frame_times_upper_limit.json')
    ) as bound_data:
      self.upper_limit_data = json.load(bound_data)[self.platform]
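
    # An illustrative sketch of the shape of the loaded upper-limit data,
    # inferred from the keys read in this file (story names and values are
    # made up). The JSON file itself is keyed by platform first:
    #
    #   {
    #     'story_name': {
    #       'avg': 20.0,
    #       'ci_095': 2.0,
    #       'cpu_wall_time_ratio': 0.5,
    #       'control': True,        # optional
    #       'experimental': True    # optional
    #     }
    #   }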

    self.args = list(sys.argv)
    # The first run uses all stories in the representative story tag, but for
    # the re-run we use only the failed stories.
    self.args.extend(['--story-tag-filter', self.story_tag])

    self.re_run_args = replace_arg_values(list(sys.argv), [
        ('--isolated-script-test-output', re_run_test_output)])

  def parse_csv_results(self, csv_obj):
    """Parses the raw CSV data.

    Converts the csv_obj into a dictionary of valid values for averages and
    confidence intervals based on the described upper_limits.

    Args:
      csv_obj: An array of rows (dict) describing the CSV results.

    Returns:
      A dictionary which has the stories as keys and arrays of confidence
      intervals and valid averages as data.
    """
    values_per_story = {}
    for row in csv_obj:
      # For now only frame_times is used for testing representatives'
      # performance and cpu_wall_time_ratio is used for validation.
      if row['name'] != METRIC_NAME and row['name'] != 'cpu_wall_time_ratio':
        continue
      story_name = row['stories']
      if story_name not in self.upper_limit_data:
        continue
      if story_name not in values_per_story:
        values_per_story[story_name] = {
            'averages': [],
            'ci_095': [],
            'cpu_wall_time_ratio': []
        }

      if row['name'] == METRIC_NAME and row['avg'] != '' and row['count'] != 0:
        values_per_story[story_name]['ci_095'].append(float(row['ci_095']))
        values_per_story[story_name]['averages'].append(float(row['avg']))
      elif row['name'] == 'cpu_wall_time_ratio' and row['avg'] != '':
        values_per_story[story_name]['cpu_wall_time_ratio'].append(
            float(row['avg']))

    return values_per_story

  def compare_values(self, values_per_story, rerun=False):
    """Compares the measured values with the recorded upper limits.

    Compares the values in values_per_story with the upper_limit_data,
    determines if each story passes or fails, and updates the ResultRecorder.

    Args:
      values_per_story: A dictionary (keyed by story) describing the CSV
        results.
      rerun: Is this a re-run or the initial run.
    """
    for story_name in values_per_story:
      # Experimental stories are not considered for failing the tests.
      if self.is_experimental_story(story_name):
        continue
      if len(values_per_story[story_name]['ci_095']) == 0:
        print(('[ FAILED ] {}/{} has no valid values for {}. Check ' +
               'run_benchmark logs for more information.').format(
                   self.benchmark, story_name, METRIC_NAME))
        self.result_recorder[rerun].add_failure(story_name, self.benchmark)
        continue

      upper_limits = self.upper_limit_data
      upper_limit_avg = upper_limits[story_name]['avg']
      upper_limit_ci = upper_limits[story_name]['ci_095']
      lower_limit_cpu_ratio = upper_limits[story_name]['cpu_wall_time_ratio']
      measured_avg = np.mean(np.array(
          values_per_story[story_name]['averages']))
      measured_ci = np.mean(np.array(values_per_story[story_name]['ci_095']))
      measured_cpu_ratio = np.mean(np.array(
          values_per_story[story_name]['cpu_wall_time_ratio']))

      if (measured_ci > upper_limit_ci * CI_ERROR_MARGIN and
          self.is_control_story(story_name)):
        print(('[ FAILED ] {}/{} {} has higher noise ({:.3f}) ' +
               'compared to upper limit ({:.3f})').format(
                   self.benchmark, story_name, METRIC_NAME, measured_ci,
                   upper_limit_ci))
        self.result_recorder[rerun].add_failure(
            story_name, self.benchmark, True)
      elif measured_avg > upper_limit_avg * AVG_ERROR_MARGIN:
        if measured_cpu_ratio >= lower_limit_cpu_ratio:
          print(('[ FAILED ] {}/{} higher average {} ({:.3f}) compared ' +
                 'to upper limit ({:.3f})').format(
                     self.benchmark, story_name, METRIC_NAME, measured_avg,
                     upper_limit_avg))
          self.result_recorder[rerun].add_failure(story_name, self.benchmark)
        else:
          print(('[ OK ] {}/{} higher average {} ({:.3f}) compared to ' +
                 'upper limit ({:.3f}). Invalidated for low ' +
                 'cpu_wall_time_ratio.').format(
                     self.benchmark, story_name, METRIC_NAME, measured_avg,
                     upper_limit_avg))
          self.result_recorder[rerun].add_invalidation_reason(
              story_name, self.benchmark, 'Low cpu_wall_time_ratio')
      else:
        print(('[ OK ] {}/{} lower average {} ({:.3f}) compared to ' +
               'upper limit ({:.3f}).').format(
                   self.benchmark, story_name, METRIC_NAME, measured_avg,
                   upper_limit_avg))

  def interpret_run_benchmark_results(self, rerun=False):
    with open(self.output_path[rerun], 'r+') as resultsFile:
      initialOut = json.load(resultsFile)
      self.result_recorder[rerun].set_tests(initialOut)

      with open(self.results_path[rerun]) as csv_file:
        csv_obj = csv.DictReader(csv_file)
        values_per_story = self.parse_csv_results(csv_obj)

      if not rerun:
        # Clear the results of run_benchmark; the gated perf results will be
        # written in their place.
        resultsFile.seek(0)
        resultsFile.truncate(0)

    self.compare_values(values_per_story, rerun)

  def run_perf_tests(self):
    self.return_code |= run_performance_tests.main(self.args)
    self.interpret_run_benchmark_results(False)

    if len(self.result_recorder[False].failed_stories) > 0:
      # Failed stories are run again to make sure the failures are not false
      # positives.
      print('============ Re-run the failed tests ============')
      all_failed_stories = '(' + '|'.join(
          self.result_recorder[False].failed_stories) + ')'
      # TODO(crbug.com/1055893): Remove the extra chrome categories after
      # investigation of flakes in representative perf tests.
      self.re_run_args.extend(
          ['--story-filter', all_failed_stories, '--pageset-repeat=3',
           '--extra-chrome-categories=blink,blink_gc,gpu,v8,viz'])

      self.return_code |= run_performance_tests.main(self.re_run_args)
      self.interpret_run_benchmark_results(True)

      for story_name in self.result_recorder[False].failed_stories.copy():
        if story_name not in self.result_recorder[True].failed_stories:
          self.result_recorder[False].remove_failure(
              story_name, self.benchmark, self.is_control_story(story_name))

    if self.result_recorder[False].is_control_stories_noisy:
      # In this case all failures are reported as expected, and the number of
      # failed stories in output.json will be zero.
      self.result_recorder[False].invalidate_failures(self.benchmark)

    (
        finalOut,
        self.return_code
    ) = self.result_recorder[False].get_output(self.return_code)

    with open(self.output_path[False], 'r+') as resultsFile:
      json.dump(finalOut, resultsFile, indent=4)

    with open(self.options.isolated_script_test_output, 'w') as outputFile:
      json.dump(finalOut, outputFile, indent=4)

    if self.result_recorder[False].is_control_stories_noisy:
      assert self.return_code == 0
      print('Control story has high noise. These runs are not reliable!')

    return self.return_code

  def is_control_story(self, story_name):
    # A story tagged as a control story in upper_limit_data is used to
    # identify possible flakes and, if it is too noisy, to invalidate the
    # results.
    return self.story_has_attribute_enabled(story_name, 'control')

  def is_experimental_story(self, story_name):
    # A story tagged as experimental in upper_limit_data is used to gather
    # performance results, but the test does not fail because of it.
    return self.story_has_attribute_enabled(story_name, 'experimental')

  def story_has_attribute_enabled(self, story_name, attribute):
    return (attribute in self.upper_limit_data[story_name] and
            self.upper_limit_data[story_name][attribute] == True)

  def set_platform_specific_attributes(self):
    if self.benchmark == 'rendering.desktop':
      # Linux does not have its own specific representatives
      # and uses the representatives chosen for Windows.
      if sys.platform == 'win32' or sys.platform.startswith('linux'):
        self.platform = 'win'
        self.story_tag = 'representative_win_desktop'
      elif sys.platform == 'darwin':
        self.platform = 'mac'
        self.story_tag = 'representative_mac_desktop'
      else:
        self.return_code = 1
    elif self.benchmark == 'rendering.mobile':
      self.platform = 'android'
      self.story_tag = 'representative_mobile'
    else:
      self.return_code = 1


def replace_arg_values(args, key_value_pairs):
  for index in range(0, len(args)):
    for (key, value) in key_value_pairs:
      if args[index].startswith(key):
        if '=' in args[index]:
          args[index] = key + '=' + value
        else:
          args[index + 1] = value
  return args
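

# A hypothetical example of replace_arg_values() above (made-up values): for
# args = ['--isolated-script-test-output=/old.json'] and
# key_value_pairs = [('--isolated-script-test-output', '/new.json')], the
# result is ['--isolated-script-test-output=/new.json']. The two-token form
# ['--isolated-script-test-output', '/old.json'] is handled by overwriting
# the following token instead.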


def main():
  test_runner = RenderingRepresentativePerfTest()

  if test_runner.return_code == 1:
    return 1

  return test_runner.run_perf_tests()


def parse_arguments():
  parser = argparse.ArgumentParser()
  parser.add_argument('executable', help='The name of the executable to run.')
  parser.add_argument(
      '--benchmarks', required=True)
  parser.add_argument(
      '--isolated-script-test-output', required=True)
  parser.add_argument(
      '--isolated-script-test-perf-output', required=False)
  return parser.parse_known_args()[0]


def main_compile_targets(args):
  json.dump([], args.output)


if __name__ == '__main__':
  # Conform minimally to the protocol defined by ScriptTest.
  if 'compile_targets' in sys.argv:
    funcs = {
        'run': None,
        'compile_targets': main_compile_targets,
    }
    sys.exit(common.run_script(sys.argv[1:], funcs))

  sys.exit(main())