Skip to content

Commit

Permalink
perf(loader): add loading in threads
Browse files Browse the repository at this point in the history
  • Loading branch information
pavellos21 committed Jul 16, 2020
1 parent 75b3c46 commit f99fc2e
Showing 1 changed file with 68 additions and 46 deletions.
114 changes: 68 additions & 46 deletions src/loaders/WikiDataWithContextLoader.py
Original file line number Diff line number Diff line change
@@ -1,62 +1,84 @@
from .WikiDataLoader import WikiDataLoader
import wptools
import re
import threading
from queue import Queue


class WikiDataWithContextLoader(WikiDataLoader):
    """Wikidata loader that also fetches the relations and entities an item
    references, using a pool of worker threads for the network requests.

    Relies on ``self._info``, ``self._page`` and ``add_image_url`` which are
    not defined in this class (presumably provided by ``WikiDataLoader`` --
    confirm against the base class).
    """

    # Number of worker threads started for every ``getEntity`` call.
    _WORKER_COUNT = 16

    # Characters removed from a page title before it becomes an identifier.
    _DROPPED_CHARS = re.compile(r"'s?|\(|\)|,")
    # Characters replaced by an underscore in an identifier.
    _UNDERSCORED_CHARS = re.compile(r"\+|-|–|\/|:|\s")

    def __init__(self):
        super().__init__()
        # Serialises writes to ``self._info`` and ``self._load_errors``
        # coming from the worker threads.
        self._lock = threading.Lock()
        # Exceptions raised by ``load_object`` inside worker threads during
        # the current ``getEntity`` call.
        self._load_errors = []

    @classmethod
    def _title_to_identifier(cls, title):
        """Return *title* with quotes/brackets/commas dropped and
        separators (``+ - – / :`` and whitespace) replaced by ``_``."""
        return cls._UNDERSCORED_CHARS.sub(
            '_', cls._DROPPED_CHARS.sub('', title))

    def load_object(self, obj, obj_type):
        """Fetch the Wikidata item *obj* and store it in ``self._info``.

        Three pages are requested: the default-language one (stored under
        the ``'en'`` key) plus German and Russian. The record is written to
        ``self._info[obj_type][obj]``, i.e. keyed by the raw Wikidata id;
        ``resolve_ids`` later re-keys it by ``'identifier'``.

        Args:
            obj: Wikidata id of the item (passed to wptools as ``wikibase``).
            obj_type: Key of the ``self._info`` sub-dict to fill; this class
                passes ``'relations'`` or ``'entities'``.

        Raises:
            KeyError: If a fetched page has no ``'label'`` or
                ``'description'`` entry.
        """
        page = wptools.page(wikibase=obj, skip=['labels'], silent=True)
        de_page = wptools.page(
            wikibase=obj, lang='de', skip=['labels', 'imageinfo'],
            silent=True)
        ru_page = wptools.page(
            wikibase=obj, lang='ru', skip=['labels', 'imageinfo'],
            silent=True)

        page.get_wikidata()
        de_page.get_wikidata()
        ru_page.get_wikidata()

        # Fall back to the raw id when the page carries no title.
        try:
            obj_id = self._title_to_identifier(page.data['title'])
        except KeyError:
            obj_id = obj

        record = {
            'identifier': obj_id,
            'label': {
                'en': page.data['label'],
                'de': de_page.data['label'],
                'ru': ru_page.data['label'],
            },
            'description': {
                'en': page.data['description'],
                'de': de_page.data['description'],
                'ru': ru_page.data['description'],
            },
        }

        with self._lock:
            self._info[obj_type][obj] = record
            # NOTE(review): kept under the lock on the assumption that
            # add_image_url writes into self._info -- confirm.
            self.add_image_url(page, obj_type, obj)

    def thread_fun(self, queue):
        """Worker loop: load ``(obj, obj_type)`` items taken from *queue*.

        A ``None`` item stops the worker. Any exception raised while loading
        is recorded in ``self._load_errors`` instead of killing the thread,
        and ``task_done`` is always called, so ``queue.join()`` cannot block
        forever because of a failed item.
        """
        while True:
            item = queue.get()
            try:
                if item is None:
                    return
                obj, obj_type = item
                self.load_object(obj, obj_type)
            except Exception as exc:
                # Thread boundary: keep the error so getEntity can re-raise.
                with self._lock:
                    self._load_errors.append(exc)
            finally:
                queue.task_done()

    def resolve_ids(self):
        """Replace raw Wikidata ids by the loaded identifiers.

        Rewrites the relation and entity slots of every triplet in place and
        re-keys ``self._info['entities']`` / ``self._info['relations']`` by
        each record's ``'identifier'``.

        Raises:
            KeyError: If a triplet references a relation or entity that is
                not present in ``self._info``.
        """
        relations = self._info['relations']
        entities = self._info['entities']

        for triplet in self._info['triplets']:
            _, rlt, ent = triplet
            triplet[1] = relations[rlt]['identifier']
            triplet[2] = entities[ent]['identifier']

        self._info['entities'] = {
            record['identifier']: record for record in entities.values()
        }
        self._info['relations'] = {
            record['identifier']: record for record in relations.values()
        }

    def getEntity(self, entity, lang='en'):
        """Load *entity* together with the relations and entities it claims.

        After the base-class load, the item's page is re-fetched, every
        claim property is queued as a relation, and every string claim value
        starting with ``'Q'`` is queued as an entity. One triplet
        ``[page_title, relation, entity]`` is appended per such value. Once
        all queued items are loaded the raw ids are resolved through
        ``resolve_ids``.

        Args:
            entity: Forwarded unchanged to the base-class ``getEntity``.
            lang: Forwarded to the base-class ``getEntity``; the context
                page itself is always requested with ``lang='en'``.

        Raises:
            Exception: The first exception raised by ``load_object`` in a
                worker thread is re-raised here once all work is finished.
        """
        super().getEntity(entity, lang=lang)
        self._page = wptools.page(
            wikibase=self._page.data['wikibase'], lang='en',
            skip=['labels', 'imageinfo'], silent=True)
        self._page.get_wikidata()
        page_title = self._title_to_identifier(self._page.data['title'])

        self._load_errors = []
        loading_queue = Queue()
        for _ in range(self._WORKER_COUNT):
            threading.Thread(
                target=self.thread_fun, args=(loading_queue,),
                daemon=True).start()

        try:
            # The same entity may be the value of several claims; fetch it
            # only once but still record a triplet for every occurrence.
            queued_entities = set()
            for rlt, ents in self._page.data['claims'].items():
                loading_queue.put((rlt, 'relations'))
                for ent in ents:
                    if not (isinstance(ent, str) and ent.startswith('Q')):
                        continue
                    if ent not in queued_entities:
                        queued_entities.add(ent)
                        loading_queue.put((ent, 'entities'))
                    self._info['triplets'].append([page_title, rlt, ent])

            loading_queue.join()
        finally:
            # Always release the workers, otherwise every call would leave
            # its threads blocked on the queue forever.
            for _ in range(self._WORKER_COUNT):
                loading_queue.put(None)

        if self._load_errors:
            raise self._load_errors[0]

        self.resolve_ids()

0 comments on commit f99fc2e

Please sign in to comment.