forked from chromium/chromium
-
Notifications
You must be signed in to change notification settings - Fork 0
/
suggest_owners.py
executable file
·356 lines (302 loc) · 12.5 KB
/
suggest_owners.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
#!/usr/bin/env python
# Copyright 2018 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
from __future__ import print_function
import argparse
import subprocess
import pickle
import os
from os import path
from datetime import date, timedelta
from collections import namedtuple, defaultdict, Counter
# Record for one parsed `git log` entry. `dirs` maps a directory path to an
# (additions, deletions) tuple summed over the files the commit changed
# directly inside that directory (see _ParseFileStatsLine).
Commit = namedtuple('Commit', ['hash', 'author', 'commit_date', 'dirs'])
# Maps directory -> {author: (commit_count, additions, deletions)}. Each commit
# is credited to the directories it touched and to all of their ancestors
# (see _PropagateCommit).
DIRECTORY_AUTHORS = defaultdict(dict)
# Memoisation cache for _GetOwners: directory path -> set of (owner, level)
# tuples.
OWNERS_CACHE = {}
# Filename of the pickle cache; it is opened as given, i.e. relative to the
# current working directory.
CACHE_FILENAME = 'suggest_owners.cache'
def _RunGitCommand(options, cmd_args):
    """Runs a git command against the repository and returns its stdout.

    Args:
      options: parsed command-line options; only `repo_path` is read.
      cmd_args: list of git arguments appended after `--git-dir <repo>/.git`.

    Returns:
      The command's standard output as text.

    Raises:
      subprocess.CalledProcessError: if git exits with a non-zero status.
    """
    repo_path = path.join(options.repo_path, '.git')
    cmd = ['git', '--git-dir', repo_path] + cmd_args
    print('>', ' '.join(cmd))
    # Ask for text output: on Python 3 check_output() otherwise returns bytes,
    # and the callers split the result on ',' and '\t' string literals.
    return subprocess.check_output(cmd, universal_newlines=True)
def _ValidAuthor(author):
return author.find('@chromium.org') > -1 and author.find('roller') == -1
def getEditsForDirectory(commit, directory):
    """Sums a commit's edits inside `directory` and its descendants.

    Args:
      commit: object whose `dirs` maps directory -> (additions, deletions).
      directory: directory path to aggregate for.

    Returns:
      An (additions, deletions) tuple; (0, 0) if nothing matched.
    """
    # Keep only the per-directory edit pairs that live at or below `directory`.
    matching_edits = [
        edits for touched_dir, edits in commit.dirs.items()
        if isSubDirectory(directory, touched_dir)
    ]
    total_additions = sum(added for added, _ in matching_edits)
    total_deletions = sum(removed for _, removed in matching_edits)
    return total_additions, total_deletions
def _PropagateCommit(options, commit):
    """Credits a commit to every directory it touched and to their ancestors.

    Updates DIRECTORY_AUTHORS[directory][commit.author], a
    (commit_count, additions, deletions) tuple, for each affected directory.

    Args:
      options: unused; kept so the signature matches the other helpers.
      commit: a Commit whose `dirs` maps directory -> (additions, deletions).
    """
    touched_dirs = set()
    # Collect the touched directories together with all of their ancestors.
    # Iterating the dict directly works on both Python 2 and 3 (iterkeys() is
    # Python 2 only).
    for directory in commit.dirs:
        # path.dirname() returns '' once the top-level directory is passed.
        # If a directory is already recorded, so are all of its ancestors, so
        # the walk can stop early.
        while directory != '' and directory not in touched_dirs:
            touched_dirs.add(directory)
            directory = path.dirname(directory)

    # Fold this commit's edits into the running totals of each directory.
    for directory in touched_dirs:
        author_commits, author_additions, author_deletions = (
            DIRECTORY_AUTHORS[directory].get(commit.author, (0, 0, 0)))
        directory_additions, directory_deletions = getEditsForDirectory(
            commit, directory)
        DIRECTORY_AUTHORS[directory][commit.author] = (
            author_commits + 1,
            author_additions + directory_additions,
            author_deletions + directory_deletions,
        )
def isSubDirectory(parent_directory, child_directory):
    """Checks if child_directory is the same as or below parent_directory.

    Paths are compared textually as '/'-separated paths; the filesystem is not
    consulted. Trailing slashes are ignored, and an empty parent_directory is
    treated as the root, which contains every path.

    Args:
      parent_directory: candidate ancestor path.
      child_directory: path to test.

    Returns:
      True if child_directory equals or is nested under parent_directory.
    """
    parent = parent_directory.rstrip('/')
    child = child_directory.rstrip('/')
    if not parent:
        return True
    # Appending '/' to both sides prevents 'a/b' from matching 'a/bc'.
    return (child + '/').startswith(parent + '/')
def _GetGitLogCmd(options):
# TODO(mheikal): git-log with --numstat vs --name-only takes 10x the time to
# complete. It takes >15 mins for git log --numstat to return the 1 year git
# history of the full repo. Should probably add a script flag to switch off
# keeping track of number of modifications per commit.
date_limit = date.today() - timedelta(days=options.days_ago)
format_string = "%h,%ae,%cI"
cmd_args = [
'log',
'--since', date_limit.isoformat(),
'--numstat',
'--pretty=format:%s'%format_string,
]
# has to be last arg
if options.subdirectory:
cmd_args += ['--', options.subdirectory]
return cmd_args
def _ParseCommitLine(line):
    """Parses a 'hash,author,date' line into a Commit with empty `dirs`.

    Raises:
      ValueError: if the line does not have exactly three comma-separated
        fields.
    """
    fields = line.split(',')
    commit_hash, author, commit_date = fields
    return Commit(commit_hash, author, commit_date, {})
def _ParseFileStatsLine(current_commit, line):
try:
additions, deletions, filepath = line.split('\t')
except ValueError:
return False
if additions == '-':
additions = 0
else:
additions = int(additions)
if deletions == '-':
deletions = 0
else:
deletions = int(deletions)
dir_path = path.dirname(filepath)
commit_additions, commit_deletions = \
current_commit.dirs.get(dir_path, (0,0))
current_commit.dirs[dir_path] = (
additions + commit_additions, deletions + commit_deletions)
return True
def processAllCommits(options):
    """Runs `git log` and folds every valid commit into DIRECTORY_AUTHORS.

    The log output is read as a sequence of commits: a header line, followed
    by numstat lines, ended by a blank line.

    Args:
      options: parsed command-line options, passed through to the helpers.
    """
    if not options.subdirectory and options.days_ago > 100:
        print('git log for your query might take > 5 minutes, limit by a '
              'subdirectory or reduce the number of days of history to low '
              'double digits to make this faster. There is no progress '
              'indicator, it is all waiting for single git log to finish.')
    output = _RunGitCommand(options, _GetGitLogCmd(options))

    current_commit = None
    for line in output.splitlines():
        if line == '':
            # A blank line ends the current commit's details. A blank line
            # with no commit in progress is skipped, not parsed as a header.
            if (current_commit is not None
                    and _ValidAuthor(current_commit.author)):
                _PropagateCommit(options, current_commit)
            current_commit = None
        elif current_commit is None:
            current_commit = _ParseCommitLine(line)
        elif not _ParseFileStatsLine(current_commit, line):
            # Merge commits weird out git-log. If we fail to parse the line,
            # then the last commit was a merge and this line is actually
            # another commit description line.
            current_commit = _ParseCommitLine(line)

    # Process the final commit. There is none when git log printed nothing or
    # when the output ended with a blank line.
    if current_commit is not None and _ValidAuthor(current_commit.author):
        _PropagateCommit(options, current_commit)
def _CountCommits(directory):
    """Returns the commit count of `directory` summed over all its authors."""
    # values() works on both Python 2 and 3; itervalues() is Python 2 only.
    return sum(
        count for (count, _a, _d) in DIRECTORY_AUTHORS[directory].values())
def _GetOwnerLevel(options, author, directory):
    """Returns the closest level at which `author` owns `directory`.

    Args:
      options: parsed command-line options, passed to _GetOwners.
      author: e-mail address to look up.
      directory: repo-relative directory path.

    Returns:
      The smallest level among the (owner, level) entries that _GetOwners
      reports for this author, or -1 if the author is not an owner.
    """
    # Taking the minimum matches sorting by level and returning the first
    # hit, without a tuple-unpacking lambda (a SyntaxError on Python 3).
    levels = [level for owner, level in _GetOwners(options, directory)
              if owner == author]
    return min(levels) if levels else -1
# TODO(mheikal): use depot_tools owners.py for parsing owners files.
def _GetOwners(options, directory_path):
    """Returns the owners of a repo subdirectory as (owner, level) tuples.

    Walks from `directory_path` up through its parents, reading each OWNERS
    file found. `level` counts how many OWNERS files were seen before the one
    listing the owner (0 for the closest). per-file directives are not
    understood. Results are memoised in OWNERS_CACHE.

    Args:
      options: parsed command-line options; `repo_path` locates the checkout.
      directory_path: repo-relative directory path.

    Returns:
      A set of (owner_email, level) tuples.
    """
    if directory_path in OWNERS_CACHE:
        return OWNERS_CACHE[directory_path]

    owners = set()
    owner_level = 0
    current_dir = directory_path
    # NOTE(review): the walk stops once the path becomes '', so the OWNERS
    # file at the repository root is never consulted. This matches the
    # previous behaviour; confirm it is intended.
    while current_dir != '':
        # Always resolve against the checkout root. Joining only the parent
        # directory would look the file up relative to the cwd.
        owners_path = path.join(options.repo_path, current_dir, 'OWNERS')
        if path.isfile(owners_path):
            parsed_owners, noparent = _ParseOwnersFile(options, owners_path)
            owners.update((owner, owner_level) for owner in parsed_owners)
            owner_level += 1
            if noparent:
                break
        current_dir = path.dirname(current_dir)

    OWNERS_CACHE[directory_path] = set(owners)
    return owners
# Parse an OWNERS file, returns set of owners and if the file sets noparent
def _ParseOwnersFile(options, filepath):
owners = set()
noparent = False
with open(filepath) as f:
for line in f.readlines():
line = line.strip()
# The script deals with directories so per-files are ignored.
if line == '' or line[0] == '#' or line.startswith('per-file'):
continue
if line.startswith('file://'):
relpath = line[7:]
abspath = path.join(options.repo_path, relpath)
parsed_owners, _ = _ParseOwnersFile(options, abspath)
owners.update(parsed_owners)
if line == 'set noparent':
noparent = True
index = line.find('@chromium.org')
if index > -1:
owners.add(line[:index + len('@chromium.org')])
return owners, noparent
# Trivial directories are ones that just contain a single child subdir and
# nothing else.
def _IsTrivialDirectory(options, repo_subdir):
try:
return len(os.listdir(path.join(options.repo_path, repo_subdir))) == 1
except OSError:
# directory no longer exists
return False
def computeSuggestions(options):
    """Builds the per-directory list of suggested authors.

    Directories are visited in sorted order. A directory is skipped when it
    lies outside `options.subdirectory`, has fewer than
    `options.dir_commit_limit` commits, or is trivial (see
    _IsTrivialDirectory).

    Args:
      options: parsed command-line options.

    Returns:
      A list of (directory, suggestions) tuples, where suggestions is a list
      of (author, (commit_count, additions, deletions)) sorted by descending
      commit count. Authors in `options.ignore_authors` or with fewer than
      `options.author_cl_limit` commits are left out.
    """
    directory_suggestions = []
    # sorted() snapshots the keys, so the loop is unaffected by defaultdict
    # lookups made while iterating.
    for directory in sorted(DIRECTORY_AUTHORS):
        authors = DIRECTORY_AUTHORS[directory]
        # The cheap in-memory filters run before the filesystem check.
        # skip suggestions for directories outside the passed in directory
        if (options.subdirectory
                and not isSubDirectory(options.subdirectory, directory)):
            continue
        if _CountCommits(directory) < options.dir_commit_limit:
            continue
        if _IsTrivialDirectory(options, directory):
            continue
        # sort authors by descending number of commits
        sorted_authors = sorted(authors.items(),
                                key=lambda item: -item[1][0])
        # keep only authors above the limit
        suggestions = [
            (author, details) for author, details in sorted_authors
            if author not in options.ignore_authors
            and details[0] >= options.author_cl_limit
        ]
        directory_suggestions.append((directory, suggestions))
    return directory_suggestions
def _PrintSettings(options):
print('Showing directories with at least ({}) commits in the last ({}) '
'days.'.format(options.dir_commit_limit, options.days_ago))
print('Showing top ({}) committers who have commited at least ({}) commits '
'to the directory in the last ({}) days.'.format(
options.max_suggestions, options.author_cl_limit,
options.days_ago))
print('(owners+N) represents distance through OWNERS files for said owner\n')
def printSuggestions(options, directory_suggestions):
    """Prints the report: settings, then one section per directory.

    Within a directory, authors are printed in the order given. Printing
    stops for that directory once `options.max_suggestions` authors who are
    not already owners have been shown.

    Args:
      options: parsed command-line options.
      directory_suggestions: list of (directory, suggestions) tuples as
        produced by computeSuggestions.
    """
    print('\nCommit stats:')
    _PrintSettings(options)
    for directory, suggestions in directory_suggestions:
        header = '{}: {} commits in the last {} days'.format(
            directory, _CountCommits(directory), options.days_ago)
        print(header)
        shown_non_owners = 0
        for author, stats in suggestions:
            commit_count, additions, deletions = stats
            owner_level = _GetOwnerLevel(options, author, directory)
            if owner_level <= -1:
                # Not an owner yet: counts towards the suggestion limit.
                shown_non_owners += 1
                owner_string = ''
            else:
                owner_string = ' (owner+{})'.format(owner_level)
            print('{}{}, commits: {}, additions:{}, deletions: {}'.format(
                author, owner_string, commit_count, additions, deletions))
            if shown_non_owners >= options.max_suggestions:
                break
        # Blank line between directory sections.
        print()
def _GetHeadCommitHash(options):
    """Returns the output of `git rev-parse HEAD`, whitespace-stripped."""
    rev_parse_output = _RunGitCommand(options, ['rev-parse', 'HEAD'])
    return rev_parse_output.strip()
def _GetCacheMetadata(options):
    """Returns the tuple identifying what the cached data was computed from.

    Returns:
      (head_commit_hash, days_ago, subdirectory)
    """
    head_hash = _GetHeadCommitHash(options)
    return (head_hash, options.days_ago, options.subdirectory)
def _IsCacheValid(options, metadata):
    """Checks whether cached data can answer the current query.

    Args:
      options: parsed command-line options for this run.
      metadata: (head_hash, days_ago, subdirectory) tuple stored with the
        cache, as built by _GetCacheMetadata.

    Returns:
      True if HEAD and days_ago are unchanged and the cached data covers the
      requested subdirectory.
    """
    head_hash, days_ago, cached_subdirectory = metadata
    if head_hash != _GetHeadCommitHash(options):
        return False
    if days_ago != options.days_ago:
        return False
    # An empty (or None) cached subdirectory means the cache covers the whole
    # repo and so can serve any query. The flag defaults to '', so testing
    # `is not None` wrongly rejected a whole-repo cache for subdirectory
    # queries.
    if cached_subdirectory and not isSubDirectory(
            cached_subdirectory, options.subdirectory or ''):
        return False
    return True
def cacheProcessedCommits(options):
    """Writes DIRECTORY_AUTHORS and its metadata to CACHE_FILENAME.

    Args:
      options: parsed command-line options, used to build the metadata.
    """
    metadata = _GetCacheMetadata(options)
    # Pickle data is binary: text mode fails on Python 3 and can corrupt the
    # stream where newlines are translated.
    with open(CACHE_FILENAME, 'wb') as f:
        pickle.dump((metadata, DIRECTORY_AUTHORS), f)
def maybeRestoreProcessedCommits(options):
    """Loads DIRECTORY_AUTHORS from the cache file if it is usable.

    Args:
      options: parsed command-line options for this run.

    Returns:
      True if the global DIRECTORY_AUTHORS was replaced with cached data;
      False if there is no cache file or it is unreadable, stale or invalid.
    """
    global DIRECTORY_AUTHORS
    if not path.exists(CACHE_FILENAME):
        return False
    try:
        # Pickle data is binary, so the file is opened in 'rb' mode.
        # NOTE(review): unpickling can execute arbitrary code. This is only
        # acceptable because the cache is a local file written by this script.
        with open(CACHE_FILENAME, 'rb') as f:
            stored_metadata, cached_directory_authors = pickle.load(f)
    except (pickle.UnpicklingError, EOFError, ValueError, TypeError,
            AttributeError, ImportError, IndexError):
        # A truncated or foreign file is treated as an invalid cache.
        print('Cache is stale or invalid, must rerun `git log`')
        return False
    if _IsCacheValid(options, stored_metadata):
        print('Loading from cache')
        DIRECTORY_AUTHORS = cached_directory_authors
        return True
    print('Cache is stale or invalid, must rerun `git log`')
    return False
def do(options):
    """Collects commit stats (from cache or git) and prints the suggestions.

    Args:
      options: parsed command-line options.
    """
    restored_from_cache = False
    if not options.skip_cache:
        restored_from_cache = maybeRestoreProcessedCommits(options)
    if not restored_from_cache:
        # Recompute from git history and refresh the cache.
        processAllCommits(options)
        cacheProcessedCommits(options)
    printSuggestions(options, computeSuggestions(options))
def main():
    """Parses the command line and runs the owner suggestion report."""
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('repo_path')
    parser.add_argument(
        '--days-ago', metavar='DAYS_AGO', type=int, default=365,
        help='Number of days of history to search through.')
    parser.add_argument(
        '--subdirectory', default='',
        help='Limit suggestions to this subdirectory')
    parser.add_argument(
        '--ignore-authors',
        help='Ignore this comma separated list of authors')
    parser.add_argument(
        '--max-suggestions', type=int, default=5,
        help='Maximum number of suggested authors per directory.')
    parser.add_argument(
        '--author-cl-limit', type=int, default=10,
        help='Do not suggest authors who have commited less than this to the '
             'directory in the last DAYS_AGO days.')
    parser.add_argument(
        '--dir-commit-limit', type=int, default=100,
        help='Skip directories with less than this number of commits in the '
             'last DAYS_AGO days.')
    parser.add_argument(
        '--skip-cache', action='store_true', default=False,
        help='Do not read from cache.')
    options = parser.parse_args()

    # Turn the comma separated author list into a set for fast lookups.
    raw_ignore_list = options.ignore_authors
    if raw_ignore_list:
        options.ignore_authors = {
            name.strip() for name in raw_ignore_list.split(',')}
    else:
        options.ignore_authors = set()
    do(options)


if __name__ == '__main__':
    main()