Commit 305e853: v 0.10.0
Chip Nguyen committed Oct 2, 2018 (initial commit)
Showing 3 changed files with 324 additions and 0 deletions.
39 changes: 39 additions & 0 deletions README.md
# sotawhat

This script runs on Python 3.

First, install the required packages. This script requires ``nltk``, ``PyEnchant``, and ``six``.

```bash
$ pip install -r requirements.txt
```

If you hit an error saying the package ``punkt`` doesn't exist, download it by opening a Python shell and running:

```bash
$ python3

>>> import nltk
>>> nltk.download('punkt')
```
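
Alternatively, the same download should work from the command line through ``nltk``'s downloader module:

```bash
$ python3 -m nltk.downloader punkt
```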

To query for a certain keyword, run:

```bash
$ python3 sotawhat.py "[keyword]" [number of results]
```

For example:

```bash
$ python3 sotawhat.py "perplexity" 10
```

If you don't specify the number of results, the script returns 5 by default. Each result contains the title of the paper with the first author and submission date, a short extract from the abstract, and a link to the paper.
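
Each printed result roughly follows this shape (bracketed fields are placeholders, not real output):

```
[title] ([first author] - [date])
[one or two key sentences extracted from the abstract]
Link: [link to the paper's arXiv page]
====================================================
```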

We've found that this script works well with keywords that are:
+ a model (e.g. transformer, wavenet, ...)
+ a dataset (e.g. wikitext, imagenet, ...)
+ a task (e.g. 'language model', 'machine translation', 'fuzzing', ...)
+ a metric (e.g. BLEU, perplexity, ...)
+ random stuff
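
Multi-word keywords also work; quote the phrase so the shell passes it as one argument:

```bash
$ python3 sotawhat.py "machine translation" 3
```
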
2 changes: 2 additions & 0 deletions requirements.txt
PyEnchant
nltk
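six  # sotawhat.py imports six.moves.html_parser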
283 changes: 283 additions & 0 deletions sotawhat.py
import datetime
import re
import sys
import urllib.error  # HTTPError is caught explicitly in get_papers
import urllib.request

import enchant
from nltk.tokenize import word_tokenize
from six.moves.html_parser import HTMLParser

# Used to unescape HTML entities (e.g. &amp;) in titles and abstracts
h = HTMLParser()

# Markers that locate the author, title, abstract, and date fields
# in the HTML returned by arXiv search
AUTHOR_TAG = '<a href="/search/?searchtype=author'
TITLE_TAG = '<p class="title is-5 mathjax">'
ABSTRACT_TAG = '<span class="abstract-full has-text-grey-dark mathjax"'
DATE_TAG = '<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span>'

def get_authors(lines, i):
    # Collect consecutive author links starting at line i; return the
    # author names and the index of the first non-author line.
    authors = []
    while True:
        if not lines[i].startswith(AUTHOR_TAG):
            break
        idx = lines[i].find('>')
        if lines[i].endswith(','):
            authors.append(lines[i][idx + 1:-5])  # strip trailing '</a>,'
        else:
            authors.append(lines[i][idx + 1:-4])  # strip trailing '</a>'
        i += 1
    return authors, i

def get_next_result(lines, start):
    """
    Extract one paper from the HTML lines of an arXiv search page.
    Each paper is a dict that contains:
        + 'title': str
        + 'main_page': str (abstract page URL)
        + 'pdf': str (PDF URL)
        + 'authors': list of str
        + 'abstract': str
        + 'date': str (submission date)
    """
    result = {}
    idx = lines[start + 3][10:].find('"')
    result['main_page'] = lines[start + 3][9:10 + idx]
    idx = lines[start + 4][23:].find('"')
    result['pdf'] = lines[start + 4][22:23 + idx] + '.pdf'

    start += 4

    while lines[start].strip() != TITLE_TAG:
        start += 1

    title = lines[start + 1].strip()
    title = title.replace('<span class="search-hit mathjax">', '')
    title = title.replace('</span>', '')
    result['title'] = title

    authors, start = get_authors(lines, start + 5)  # orig: add 8

    while not lines[start].strip().startswith(ABSTRACT_TAG):
        start += 1
    abstract = lines[start + 1]
    abstract = abstract.replace('<span class="search-hit mathjax">', '')
    abstract = abstract.replace('</span>', '')
    result['abstract'] = abstract

    result['authors'] = authors

    while not lines[start].strip().startswith(DATE_TAG):
        start += 1

    idx = lines[start].find('</span> ')
    end = lines[start][idx:].find(';')

    result['date'] = lines[start][idx + 8:idx + end]

    return result, start

def clean_empty_lines(lines):
    cleaned = []
    for line in lines:
        line = line.strip()
        if line:
            cleaned.append(line)
    return cleaned

def is_float(token):
    # Matches decimals such as '1.7' (raw string avoids an invalid-escape warning)
    return re.match(r"^\d+?\.\d+?$", token) is not None

def is_citation_year(tokens, i):
    # True if token i looks like a 4-digit year wrapped in citation
    # punctuation, e.g. '(2018)' or ', 2018)'
    if len(tokens[i]) != 4:
        return False
    if re.match(r'[12][0-9]{3}', tokens[i]) is None:
        return False
    if i == 0 or i == len(tokens) - 1:
        return False
    if (tokens[i - 1] == ',' or tokens[i - 1] == '(') and tokens[i + 1] == ')':
        return True
    return False

def is_list_number(tokens, i, value):
    # True if token i is a small list marker such as '1)' or '(2)'
    if value < 1 or value > 4:
        return False
    if i == len(tokens) - 1:
        return False

    if (i == 0 or tokens[i - 1] in set(['(', '.', ':'])) and tokens[i + 1] == ')':
        return True
    return False


def has_number(sent):
    # True if the sentence contains a numeric token that is neither
    # a citation year nor a list marker
    tokens = word_tokenize(sent)
    for i, token in enumerate(tokens):
        if token.endswith('\\'):  # strip trailing escape characters
            token = token[:-2]
        if token.endswith('x'):  # sometimes people write numbers as 1.7x
            token = token[:-1]
        if token.startswith('x'):  # sometimes people write numbers as x1.7
            token = token[1:]
        if token.startswith('$') and token.endswith('$'):
            token = token[1:-1]
        if is_float(token):
            return True
        try:
            value = int(token)
        except ValueError:
            continue
        if (not is_citation_year(tokens, i)) and (not is_list_number(tokens, i, value)):
            return True

    return False

def contains_sota(sent):
    return 'state-of-the-art' in sent or 'state of the art' in sent or 'SOTA' in sent

def extract_line(abstract, keyword, limit):
    # Pick the most informative sentences around the keyword, preferring
    # sentences that contain numbers or state-of-the-art claims. Returns
    # the extract and whether it contains a number.
    lines = []
    numbered_lines = []
    has_sota = False
    kw_mentioned = False
    abstract = abstract.replace("et. al", "et al.")
    sentences = abstract.split('. ')
    kw_sentences = []
    for i, sent in enumerate(sentences):
        if keyword in sent.lower():
            kw_mentioned = True
            if has_number(sent):
                numbered_lines.append(sent)
            elif contains_sota(sent):
                has_sota = True
                numbered_lines.append(sent)
            else:
                kw_sentences.append(sent)
                lines.append(sent)
            continue

        if kw_mentioned and has_number(sent):
            if not numbered_lines:
                numbered_lines.append(kw_sentences[-1])
            numbered_lines.append(sent)
        if kw_mentioned and contains_sota(sent):
            lines.append(sent)

    if len(numbered_lines) > 0:
        return '. '.join(numbered_lines), True
    return '. '.join(lines[-2:]), False

def get_report(paper, keyword):
    # Build a printable report for one paper if its abstract mentions the keyword
    if keyword in paper['abstract'].lower():
        title = h.unescape(paper['title'])
        headline = '{} ({} - {})\n'.format(title, paper['authors'][0], paper['date'])
        abstract = h.unescape(paper['abstract'])
        extract, has_number = extract_line(abstract, keyword, 280 - len(headline))
        if extract:
            report = headline + extract + '\nLink: {}'.format(paper['main_page'])
            return report, has_number
    return '', False

def txt2reports(txt, keyword, num_to_show):
    found = False
    txt = ''.join(chr(c) for c in txt)  # decode the raw response bytes
    lines = txt.split('\n')
    lines = clean_empty_lines(lines)
    unshown = []

    # Use a while loop so the index returned by get_next_result actually
    # skips past the lines of a parsed paper (a for loop would reset it).
    i = 0
    while i < len(lines):
        if num_to_show <= 0:
            return unshown, num_to_show, found

        line = lines[i].strip()
        if len(line) == 0:
            i += 1
            continue
        if line == '<li class="arxiv-result">':
            found = True
            paper, i = get_next_result(lines, i)
            report, has_number = get_report(paper, keyword)

            if has_number:
                print(report)
                print('====================================================')
                num_to_show -= 1
            elif report:
                unshown.append(report)
        if line == '</ol>':
            break
        i += 1
    return unshown, num_to_show, found

def get_papers(keyword, num_results=5):
    """
    If the keyword is an English word, search only within the CS category
    to avoid ambiguous matches with papers from other fields.
    """
    if keyword in set(['GAN', 'bpc']):
        query_temp = 'https://arxiv.org/search/advanced?advanced=&terms-0-operator=AND&terms-0-term={}&terms-0-field=all&classification-computer_science=y&classification-physics_archives=all&date-filter_by=all_dates&date-year=&date-from_date=&date-to_date=&date-date_type=submitted_date&abstracts=show&size={}&order=-announced_date_first&start={}'
        keyword = keyword.lower()
    else:
        keyword = keyword.lower()
        d = enchant.Dict('en_US')
        if d.check(keyword):
            query_temp = 'https://arxiv.org/search/advanced?advanced=&terms-0-operator=AND&terms-0-term={}&terms-0-field=all&classification-computer_science=y&classification-physics_archives=all&date-filter_by=all_dates&date-year=&date-from_date=&date-to_date=&date-date_type=submitted_date&abstracts=show&size={}&order=-announced_date_first&start={}'
        else:
            query_temp = 'https://arxiv.org/search/?searchtype=all&query={}&abstracts=show&size={}&order=-announced_date_first&start={}'
    keyword_q = keyword.replace(' ', '+')
    page = 0
    per_page = 200
    num_to_show = num_results
    all_unshown = []

    while num_to_show > 0:
        query = query_temp.format(keyword_q, str(per_page), str(per_page * page))

        req = urllib.request.Request(query)
        try:
            response = urllib.request.urlopen(req)
        except urllib.error.HTTPError as e:
            print('Error {}: problem accessing the server'.format(e.code))
            return

        txt = response.read()
        unshown, num_to_show, found = txt2reports(txt, keyword, num_to_show)
        if not found and not all_unshown and num_to_show == num_results:
            print('Sorry, we were unable to find any abstract with the word {}'.format(keyword))
            return

        if num_to_show < num_results / 2 or not found:
            # Running low on strong results: fall back to reports without numbers
            for report in all_unshown[:num_to_show]:
                print(report)
                print('====================================================')
            if not found:
                return
            num_to_show -= len(all_unshown)
        else:
            all_unshown.extend(unshown)
        page += 1

def main():
    if len(sys.argv) < 2:
        raise ValueError('You must specify a keyword')
    if len(sys.argv) > 3:
        raise ValueError('Too many arguments')

    keyword = sys.argv[1]

    if len(sys.argv) == 3:
        try:
            num_results = int(sys.argv[2])
        except ValueError:
            print('The second argument must be an integer')
            return
        if num_results <= 0:
            raise ValueError('You must choose to show a positive number of results')
    else:
        num_results = 5

    get_papers(keyword, num_results)

if __name__ == '__main__':
    main()
