forked from cmb-css/twitter-hoover
-
Notifications
You must be signed in to change notification settings - Fork 1
/
merge-quotes-2020.py
61 lines (51 loc) · 2.05 KB
/
merge-quotes-2020.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import json
def month2file(month):
    """Return the name of the 2020 quotes file for the given month.

    The month is formatted with a minimum width of two and zero padding,
    so ``month2file(3)`` gives ``'quotes-2020-03.json'``.
    """
    return f'quotes-2020-{month:02}.json'
class MergeQuotes:
    """Merge the twelve monthly quote files into one JSON-lines file.

    Every input file (named by ``month2file``) is read as one JSON object
    per line.  The code relies on the keys ``id``, ``is_quote``, ``quotes``
    (a list of nested tweet objects) and ``quote_ids`` of those objects.

    For each month, tweets with a false ``is_quote`` that were not seen in
    an earlier month become roots.  Quotes found for the same tweet ids in
    later months are merged into the stored trees, and the roots are then
    appended to ``outfile``.
    """

    def __init__(self, outfile):
        """Remember the output path and start with empty bookkeeping.

        Args:
            outfile: Path of the file that ``run`` appends merged roots to.
        """
        self.outfile = outfile
        # Ids of every tweet (root or nested quote) registered so far,
        # across all months already processed.
        self.root_ids = set()
        # id -> tweet object for the month currently being processed.
        self.cur_tweets = {}

    @staticmethod
    def _read_tweets(path):
        """Yield one parsed JSON object per non-blank line of ``path``.

        The file is decoded as UTF-8 explicitly, matching the encoding used
        for the output, instead of depending on the platform default.
        Whitespace-only lines are skipped rather than handed to the parser.
        """
        with open(path, 'rt', encoding='utf-8') as f:
            for line in f:
                if line.strip():
                    yield json.loads(line)

    def _add_quotes(self, tweet):
        """Register the not-yet-seen quotes nested under ``tweet``.

        The stored values are the same dict objects that sit inside the
        parent's ``quotes`` list, so merging into one of them later also
        changes what is serialised for the enclosing root.
        """
        for quote in tweet['quotes']:
            tid = quote['id']
            if tid not in self.root_ids:
                self.cur_tweets[tid] = quote
                self.root_ids.add(tid)
                # NOTE(review): recursion is limited to newly seen quotes;
                # confirm this matches the intended nesting.
                self._add_quotes(quote)

    def _month_tweets(self, month):
        """Load the roots that first appear in ``month`` into cur_tweets.

        ``cur_tweets`` is reset first; ``root_ids`` keeps accumulating so a
        tweet registered in an earlier month is not registered again.
        """
        self.cur_tweets = {}
        for tweet in self._read_tweets(month2file(month)):
            tid = tweet['id']
            if not tweet['is_quote'] and tid not in self.root_ids:
                self.cur_tweets[tid] = tweet
                self.root_ids.add(tid)
                self._add_quotes(tweet)

    def _merge_trees(self, month):
        """Pull quotes recorded in later months into the current tweets.

        Every file after ``month`` is scanned.  When a record has the id of
        a tweet in ``cur_tweets``, each of its quotes whose id is not yet
        listed in that tweet's ``quote_ids`` is appended to the tweet's
        ``quotes`` and ``quote_ids`` lists.
        """
        for later in range(month + 1, 13):
            for tweet in self._read_tweets(month2file(later)):
                main_tweet = self.cur_tweets.get(tweet['id'])
                if main_tweet is None:
                    continue
                # Set lookup avoids rescanning the quote_ids list for
                # every candidate quote; the list itself stays the
                # serialised representation.
                known = set(main_tweet['quote_ids'])
                for quote in tweet['quotes']:
                    qid = quote['id']
                    if qid not in known:
                        main_tweet['quotes'].append(quote)
                        main_tweet['quote_ids'].append(qid)
                        known.add(qid)

    def run(self):
        """Process months 1 to 12 and append the merged roots to outfile.

        The output is opened in append mode for every month, so existing
        content of ``outfile`` is kept and running the script again adds
        the same lines a second time.
        """
        for month in range(1, 13):
            print('processing month: {}'.format(month))
            self._month_tweets(month)
            self._merge_trees(month)
            with open(self.outfile, 'at', encoding='utf-8') as f:
                for tweet in self.cur_tweets.values():
                    # Nested quotes are written as part of their root.
                    if not tweet['is_quote']:
                        f.write('{}\n'.format(
                            json.dumps(tweet, ensure_ascii=False)))
if __name__ == '__main__':
    # Script entry point: merge the monthly files into quotes-2020.json.
    MergeQuotes('quotes-2020.json').run()