Skip to content

Commit

Permalink
Bug fix: should fetch all csvs every time we back up.
Browse files Browse the repository at this point in the history
Previously we fetched csvs only from diff pads.
However, diff pads only indicate new foldrs, or new pads of existing foldrs.
We need to fetch all csvs to back up all changes.
  • Loading branch information
JmeHsieh committed Oct 10, 2016
1 parent df696ad commit c80e81b
Show file tree
Hide file tree
Showing 2 changed files with 40 additions and 40 deletions.
19 changes: 8 additions & 11 deletions backup.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,17 +43,14 @@ def backup():
last_commit = None

diff_pads = hackpads.get_diffs(last_commit)
if not diff_pads:
logging.info('no diff pads since last pull.')
else:
hackfoldrs.gen_foldrs(diff_pads, pads_path)
hackfoldrs.copy_to_repo()
hackfoldrs.commit_push()
hackfoldrs.clean_gened_foldrs()

logging.info('update latest commit sha')
with open(join(DATA, 'last_commit.txt'), 'w') as f:
f.write(hackpads.latest_commit())
hackfoldrs.gen_foldrs(diff_pads, pads_path)
hackfoldrs.copy_to_repo()
hackfoldrs.commit_push()
hackfoldrs.clean_gened_foldrs()

logging.info('update latest commit sha')
with open(join(DATA, 'last_commit.txt'), 'w') as f:
f.write(hackpads.latest_commit())


if __name__ == '__main__':
Expand Down
61 changes: 32 additions & 29 deletions hackfoldrs.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from bs4 import BeautifulSoup, SoupStrainer
from collections import OrderedDict
from copy import deepcopy
import json
import logging
from os import listdir, makedirs
Expand All @@ -19,8 +20,8 @@ def __init__(self, repo_url, repo_path, gen_foldrs_path):
self.repo = None
self.gen_foldrs_path = gen_foldrs_path

def _extract_foldrs(self, diff_pads, pads_path):
""" Construct hackfoldr indexes by
def _find_new_foldrs(self, diff_pads, pads_path):
""" Construct new hackfoldr indexes by
scanning hackfoldr links from hackpad htmls """

with open(join(pads_path, 'pads.json'), 'r') as f:
Expand Down Expand Up @@ -83,17 +84,20 @@ def _get_csv(self, foldr_id):

return (csv, source)

def _merge_foldr(self, old, new):
old.update({k: v for k, v in new.items()
if not (isinstance(v, list) or isinstance(v, dict))})
for k, v in new.items():
def _merged_foldr(self, old, new):
_old = deepcopy(old)
_new = deepcopy(new)
for k, v in _new.items():
if isinstance(v, list):
_o = set(old.get(k, []))
_n = set(new.get(k, []))
_u = _o.union(_n)
old.update({k: sorted(list(_u))})
_o = set(_old.get(k, []))
_n = set(v)
_u = _o & _n
_old.update({k: sorted(list(_u))})
elif isinstance(v, dict):
raise NotImplementedError
else:
_old.update({k: v})
return _old

def pull_repo(self):
try:
def gen_foldrs(self, diff_pads, pads_path):
    """Regenerate the hackfoldr json files under ``self.gen_foldrs_path``.

    Merges the previously committed ``foldrs.json`` (from the repo) with
    foldrs newly discovered in *diff_pads*, then re-fetches the csv for
    EVERY foldr -- not just the diffed ones -- so all upstream changes are
    backed up. Foldrs whose csv could not be fetched (no 'source') are
    dropped from the written index.
    """
    makedirs(self.gen_foldrs_path, exist_ok=True)
    fn = 'foldrs.json'

    # merge old w/ new
    try:
        with open(join(self.repo_path, fn), 'r') as f:
            old = json.load(f)
    except OSError:
        # no previous foldrs.json (e.g. first run) -- start from scratch
        old = {}
    new = self._find_new_foldrs(diff_pads, pads_path)
    mix = {_id: self._merged_foldr(old.get(_id, {}), new.get(_id, {}))
           for _id in old.keys() | new.keys()}

    # fetch a csv for every foldr and write {foldr_id}.json
    for _id, foldr in mix.items():
        csv, source = self._get_csv(_id)
        if csv:
            foldr['source'] = source
            # format the filename before joining; formatting the whole
            # joined path would also substitute braces in the base path
            with open(join(self.gen_foldrs_path, '{}.json'.format(_id)), 'w') as f:
                json.dump(csv, f, indent=2, ensure_ascii=False)

    # convert the 'hackpads' set to a json-serializable list and drop
    # foldrs that never got a 'source'
    clean = {}
    for _id, foldr in mix.items():
        if 'source' in foldr:
            foldr['hackpads'] = list(foldr['hackpads'])
            clean[_id] = foldr

    # write foldrs.json
    with open(join(self.gen_foldrs_path, fn), 'w') as f:
        json.dump(clean, f, sort_keys=True, indent=2, ensure_ascii=False)

    logging.info('gen foldrs complete')

Expand Down

0 comments on commit c80e81b

Please sign in to comment.