-
Notifications
You must be signed in to change notification settings - Fork 0
/
twitteraae.patch.diff
54 lines (48 loc) · 1.87 KB
/
twitteraae.patch.diff
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
diff --git a/code/predict.py b/code/predict.py
index a287d67..0f67f5e 100644
--- a/code/predict.py
+++ b/code/predict.py
@@ -1,6 +1,5 @@
from __future__ import division
import numpy as np
-import sys,os
vocabfile = "../model/model_vocab.txt"
modelfile = "../model/model_count_table.txt"
@@ -20,7 +19,7 @@ def load_model():
K = len(N_k)
wordprobs = (N_wk + 1) / N_k
- vocab = [L.split("\t")[-1].strip().decode("utf-8") for L in open(vocabfile)]
+ vocab = [L.split("\t")[-1].strip() for L in open(vocabfile)]
w2num = {w:i for i,w in enumerate(vocab)}
assert len(vocab) == N_wk.shape[0]
@@ -30,16 +29,16 @@ def infer_cvb0(invocab_tokens, alpha, numpasses):
# initialize with likelihoods
Qs = np.zeros((doclen, K))
- for i in xrange(doclen):
+ for i in range(doclen):
w = invocab_tokens[i]
Qs[i,:] = wordprobs[w2num[w],:]
Qs[i,:] /= Qs[i,:].sum()
lik = Qs.copy() # pertoken normalized but proportionally the same for inference
Q_k = Qs.sum(0)
- for itr in xrange(1,numpasses):
+ for itr in range(1,numpasses):
# print "cvb0 iter", itr
- for i in xrange(doclen):
+ for i in range(doclen):
Q_k -= Qs[i,:]
Qs[i,:] = lik[i,:] * (Q_k + alpha)
Qs[i,:] /= Qs[i,:].sum()
@@ -50,11 +49,11 @@ def infer_cvb0(invocab_tokens, alpha, numpasses):
def predict(tokens, alpha=1, numpasses=5, thresh1=1, thresh2=0.2):
if len(tokens)>0:
- assert isinstance(tokens[0], unicode)
+ assert isinstance(tokens[0], str)
invocab_tokens = [w.lower() for w in tokens if w.lower() in w2num]
# check that at least xx tokens are in vocabulary
if len(invocab_tokens) < thresh1:
- return None
+ return None
# check that at least yy% of tokens are in vocabulary
elif len(invocab_tokens) / len(tokens) < thresh2:
return None