Commit 305e853: v 0.10.0
Chip Nguyen committed Oct 2, 2018 (initial commit)
Showing 3 changed files with 324 additions and 0 deletions.
39 changes: 39 additions & 0 deletions README.md
# sotawhat

This script runs on Python 3.

First, install the required packages. This script requires ``nltk``, ``PyEnchant``, and ``six``.

```bash
$ pip install -r requirements.txt
```

If you hit an error saying the package ``punkt`` doesn't exist, download it by opening a Python shell and running:

```bash
$ python3

>>> import nltk
>>> nltk.download('punkt')
```
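
Alternatively, the same download should work from the command line through ``nltk``'s downloader module:

```bash
$ python3 -m nltk.downloader punkt
```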

To query for a certain keyword, run:

```bash
$ python3 sotawhat.py "[keyword]" [number of results]
```

For example:

```bash
$ python3 sotawhat.py "perplexity" 10
```

If you don't specify the number of results, the script returns 5 by default. Each result contains the title of the paper with the first author and submission date, a short extract from the abstract, and a link to the paper.
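
Each printed result roughly follows this shape (bracketed fields are placeholders, not real output):

```
[title] ([first author] - [date])
[one or two key sentences extracted from the abstract]
Link: [link to the paper's arXiv page]
====================================================
```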

We've found that this script works well with keywords that are:
+ a model (e.g. transformer, wavenet, ...)
+ a dataset (e.g. wikitext, imagenet, ...)
+ a task (e.g. 'language model', 'machine translation', 'fuzzing', ...)
+ a metric (e.g. BLEU, perplexity, ...)
+ random stuff
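
Multi-word keywords also work; quote the phrase so the shell passes it as one argument:

```bash
$ python3 sotawhat.py "machine translation" 3
```
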
2 changes: 2 additions & 0 deletions requirements.txt
PyEnchant
nltk
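six  # sotawhat.py imports six.moves.html_parser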
283 changes: 283 additions & 0 deletions sotawhat.py
import datetime
import re
import sys
import urllib.error  # HTTPError is caught explicitly in get_papers
import urllib.request

import enchant
from nltk.tokenize import word_tokenize
from six.moves.html_parser import HTMLParser

# Used to unescape HTML entities (e.g. &amp;) in titles and abstracts
h = HTMLParser()

# Markers that locate the author, title, abstract, and date fields
# in the HTML returned by arXiv search
AUTHOR_TAG = '<a href="/search/?searchtype=author'
TITLE_TAG = '<p class="title is-5 mathjax">'
ABSTRACT_TAG = '<span class="abstract-full has-text-grey-dark mathjax"'
DATE_TAG = '<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span>'

def get_authors(lines, i):
    # Collect consecutive author links starting at line i; return the
    # author names and the index of the first non-author line.
    authors = []
    while True:
        if not lines[i].startswith(AUTHOR_TAG):
            break
        idx = lines[i].find('>')
        if lines[i].endswith(','):
            authors.append(lines[i][idx + 1:-5])  # strip trailing '</a>,'
        else:
            authors.append(lines[i][idx + 1:-4])  # strip trailing '</a>'
        i += 1
    return authors, i

def get_next_result(lines, start):
    """
    Extract one paper from the HTML lines of an arXiv search page.
    Each paper is a dict that contains:
        + 'title': str
        + 'main_page': str (abstract page URL)
        + 'pdf': str (PDF URL)
        + 'authors': list of str
        + 'abstract': str
        + 'date': str (submission date)
    """
    result = {}
    idx = lines[start + 3][10:].find('"')
    result['main_page'] = lines[start + 3][9:10 + idx]
    idx = lines[start + 4][23:].find('"')
    result['pdf'] = lines[start + 4][22:23 + idx] + '.pdf'

    start += 4

    while lines[start].strip() != TITLE_TAG:
        start += 1

    title = lines[start + 1].strip()
    title = title.replace('<span class="search-hit mathjax">', '')
    title = title.replace('</span>', '')
    result['title'] = title

    authors, start = get_authors(lines, start + 5)  # orig: add 8

    while not lines[start].strip().startswith(ABSTRACT_TAG):
        start += 1
    abstract = lines[start + 1]
    abstract = abstract.replace('<span class="search-hit mathjax">', '')
    abstract = abstract.replace('</span>', '')
    result['abstract'] = abstract

    result['authors'] = authors

    while not lines[start].strip().startswith(DATE_TAG):
        start += 1

    idx = lines[start].find('</span> ')
    end = lines[start][idx:].find(';')

    result['date'] = lines[start][idx + 8:idx + end]

    return result, start

def clean_empty_lines(lines):
    cleaned = []
    for line in lines:
        line = line.strip()
        if line:
            cleaned.append(line)
    return cleaned

def is_float(token):
    # Matches decimals such as '1.7' (raw string avoids an invalid-escape warning)
    return re.match(r"^\d+?\.\d+?$", token) is not None

def is_citation_year(tokens, i):
    # True if token i looks like a 4-digit year wrapped in citation
    # punctuation, e.g. '(2018)' or ', 2018)'
    if len(tokens[i]) != 4:
        return False
    if re.match(r'[12][0-9]{3}', tokens[i]) is None:
        return False
    if i == 0 or i == len(tokens) - 1:
        return False
    if (tokens[i - 1] == ',' or tokens[i - 1] == '(') and tokens[i + 1] == ')':
        return True
    return False

def is_list_number(tokens, i, value):
    # True if token i is a small list marker such as '1)' or '(2)'
    if value < 1 or value > 4:
        return False
    if i == len(tokens) - 1:
        return False

    if (i == 0 or tokens[i - 1] in set(['(', '.', ':'])) and tokens[i + 1] == ')':
        return True
    return False


def has_number(sent):
    # True if the sentence contains a numeric token that is neither
    # a citation year nor a list marker
    tokens = word_tokenize(sent)
    for i, token in enumerate(tokens):
        if token.endswith('\\'):  # strip trailing escape characters
            token = token[:-2]
        if token.endswith('x'):  # sometimes people write numbers as 1.7x
            token = token[:-1]
        if token.startswith('x'):  # sometimes people write numbers as x1.7
            token = token[1:]
        if token.startswith('$') and token.endswith('$'):
            token = token[1:-1]
        if is_float(token):
            return True
        try:
            value = int(token)
        except ValueError:
            continue
        if (not is_citation_year(tokens, i)) and (not is_list_number(tokens, i, value)):
            return True

    return False

def contains_sota(sent):
    return 'state-of-the-art' in sent or 'state of the art' in sent or 'SOTA' in sent

def extract_line(abstract, keyword, limit):
    # Pick the most informative sentences around the keyword, preferring
    # sentences that contain numbers or state-of-the-art claims. Returns
    # the extract and whether it contains a number.
    lines = []
    numbered_lines = []
    has_sota = False
    kw_mentioned = False
    abstract = abstract.replace("et. al", "et al.")
    sentences = abstract.split('. ')
    kw_sentences = []
    for i, sent in enumerate(sentences):
        if keyword in sent.lower():
            kw_mentioned = True
            if has_number(sent):
                numbered_lines.append(sent)
            elif contains_sota(sent):
                has_sota = True
                numbered_lines.append(sent)
            else:
                kw_sentences.append(sent)
                lines.append(sent)
            continue

        if kw_mentioned and has_number(sent):
            if not numbered_lines:
                numbered_lines.append(kw_sentences[-1])
            numbered_lines.append(sent)
        if kw_mentioned and contains_sota(sent):
            lines.append(sent)

    if len(numbered_lines) > 0:
        return '. '.join(numbered_lines), True
    return '. '.join(lines[-2:]), False

def get_report(paper, keyword):
    # Build a printable report for one paper if its abstract mentions the keyword
    if keyword in paper['abstract'].lower():
        title = h.unescape(paper['title'])
        headline = '{} ({} - {})\n'.format(title, paper['authors'][0], paper['date'])
        abstract = h.unescape(paper['abstract'])
        extract, has_number = extract_line(abstract, keyword, 280 - len(headline))
        if extract:
            report = headline + extract + '\nLink: {}'.format(paper['main_page'])
            return report, has_number
    return '', False

def txt2reports(txt, keyword, num_to_show):
    found = False
    txt = ''.join(chr(c) for c in txt)  # decode the raw response bytes
    lines = txt.split('\n')
    lines = clean_empty_lines(lines)
    unshown = []

    # Use a while loop so the index returned by get_next_result actually
    # skips past the lines of a parsed paper (a for loop would reset it).
    i = 0
    while i < len(lines):
        if num_to_show <= 0:
            return unshown, num_to_show, found

        line = lines[i].strip()
        if len(line) == 0:
            i += 1
            continue
        if line == '<li class="arxiv-result">':
            found = True
            paper, i = get_next_result(lines, i)
            report, has_number = get_report(paper, keyword)

            if has_number:
                print(report)
                print('====================================================')
                num_to_show -= 1
            elif report:
                unshown.append(report)
        if line == '</ol>':
            break
        i += 1
    return unshown, num_to_show, found

def get_papers(keyword, num_results=5):
    """
    If the keyword is an English word, search only within the CS category
    to avoid ambiguous matches with papers from other fields.
    """
    if keyword in set(['GAN', 'bpc']):
        query_temp = 'https://arxiv.org/search/advanced?advanced=&terms-0-operator=AND&terms-0-term={}&terms-0-field=all&classification-computer_science=y&classification-physics_archives=all&date-filter_by=all_dates&date-year=&date-from_date=&date-to_date=&date-date_type=submitted_date&abstracts=show&size={}&order=-announced_date_first&start={}'
        keyword = keyword.lower()
    else:
        keyword = keyword.lower()
        d = enchant.Dict('en_US')
        if d.check(keyword):
            query_temp = 'https://arxiv.org/search/advanced?advanced=&terms-0-operator=AND&terms-0-term={}&terms-0-field=all&classification-computer_science=y&classification-physics_archives=all&date-filter_by=all_dates&date-year=&date-from_date=&date-to_date=&date-date_type=submitted_date&abstracts=show&size={}&order=-announced_date_first&start={}'
        else:
            query_temp = 'https://arxiv.org/search/?searchtype=all&query={}&abstracts=show&size={}&order=-announced_date_first&start={}'
    keyword_q = keyword.replace(' ', '+')
    page = 0
    per_page = 200
    num_to_show = num_results
    all_unshown = []

    while num_to_show > 0:
        query = query_temp.format(keyword_q, str(per_page), str(per_page * page))

        req = urllib.request.Request(query)
        try:
            response = urllib.request.urlopen(req)
        except urllib.error.HTTPError as e:
            print('Error {}: problem accessing the server'.format(e.code))
            return

        txt = response.read()
        unshown, num_to_show, found = txt2reports(txt, keyword, num_to_show)
        if not found and not all_unshown and num_to_show == num_results:
            print('Sorry, we were unable to find any abstract with the word {}'.format(keyword))
            return

        if num_to_show < num_results / 2 or not found:
            # Running low on strong results: fall back to reports without numbers
            for report in all_unshown[:num_to_show]:
                print(report)
                print('====================================================')
            if not found:
                return
            num_to_show -= len(all_unshown)
        else:
            all_unshown.extend(unshown)
        page += 1

def main():
    if len(sys.argv) < 2:
        raise ValueError('You must specify a keyword')
    if len(sys.argv) > 3:
        raise ValueError('Too many arguments')

    keyword = sys.argv[1]

    if len(sys.argv) == 3:
        try:
            num_results = int(sys.argv[2])
        except ValueError:
            print('The second argument must be an integer')
            return
        if num_results <= 0:
            raise ValueError('You must choose to show a positive number of results')
    else:
        num_results = 5

    get_papers(keyword, num_results)

if __name__ == '__main__':
    main()
