Skip to content

Commit

Permalink
json.dumps() for author balance and TFIDF visualizations
Browse files (browse the repository at this point in the history)
  • Loading branch information
candu committed Dec 30, 2012
1 parent c553ee0 commit d883573
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 11 deletions.
27 changes: 18 additions & 9 deletions analytics/authors.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -6,6 +6,7 @@
from utils import *

sys.stdout = codecs.getwriter('UTF-8')(sys.stdout)
D = []
words = {EVAN: Counter(), VALKYRIE: Counter()}
normalizer = createNormalizer(
allow_nonalpha=False,
Expand All @@ -15,14 +16,22 @@
data = json.loads(line)
data['content'] = etree.HTML(data['content'])
author = identifyAuthor(data)
words[author].update(extractWords(data['content'], normalizer))
doc = extractWords(data['content'], normalizer)
words[author].update(doc)
D.append(doc)
totals = words[EVAN] + words[VALKYRIE]
E = []
idf = IDF(D)
out = []
for W, N in totals.iteritems():
if N < 5 or len(W) < 2:
continue
e = words[EVAN][W] / float(N)
E.append((W, N, e))
E.sort(key=lambda x: (x[2], x[1], x[0]))
for W, N, e in E:
print '%30s %10d %10.4f %10.4f' % (W, N, e, 1.0 - e)
E = words[EVAN][W]
out.append({
'word' : W,
'count' : {
'evan' : E,
'valkyrie' : N - E,
'total' : N
},
'idf': idf[W]
})

print json.dumps(out)
6 changes: 4 additions & 2 deletions analytics/tfidf.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -14,12 +14,14 @@
for line in sys.stdin:
data = json.loads(line)
data['content'] = etree.HTML(data['content'])
D[data['path']] = extractWords(data['content'], normalizer)
D[data['href']] = extractWords(data['content'], normalizer)
idf = IDF(D.values())
out = {}
for k, d in sorted(D.iteritems()):
tf = TF(d, a=0.4)
r = [(W, tf[W] * idf[W]) for W in tf]
r.sort(key=lambda x: (x[1], x[0]), reverse=True)
gs = itertools.groupby(r, key=lambda x: x[1])
gs = itertools.imap(lambda g: (g[0], [x[0] for x in g[1]]), gs)
print json.dumps({k: list(itertools.islice(gs, 3))})
out[k] = list(gs)
print json.dumps(out)

0 comments on commit d883573

Please sign in to comment.