Skip to content

Commit

Permalink
Merge pull request #1 from everping/master
Browse files Browse the repository at this point in the history
Reformat code by PEP8 and remove unused variables
  • Loading branch information
Chip Huyen committed Oct 5, 2018
2 parents 305e853 + 49e60e6 commit ba69bab
Show file tree
Hide file tree
Showing 2 changed files with 52 additions and 31 deletions.
14 changes: 14 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,20 @@ $ python3
>>> nltk.download('punkt')
```

On macOS, you may get the following SSL error:

```
[nltk_data] Error loading punkt: <urlopen error [SSL:
[nltk_data] CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data] unable to get local issuer certificate (_ssl.c:1045)>
```

This can be fixed by reinstalling the certificates:
```shell
$ /Applications/Python\ 3.x/Install\ Certificates.command
```


To query for a certain keyword, run:

```bash
Expand Down
69 changes: 38 additions & 31 deletions sotawhat.py
Original file line number Diff line number Diff line change
@@ -1,36 +1,35 @@
import datetime
import re
import sys
import urllib
import urllib.error
import urllib.request

from six.moves.html_parser import HTMLParser
h = HTMLParser()

import enchant
from nltk.tokenize import word_tokenize
from six.moves.html_parser import HTMLParser

h = HTMLParser()

# HTML fragments matched against lines of the arXiv search-results page.
# get_authors() treats a line starting with AUTHOR_TAG as an author entry.
AUTHOR_TAG = '<a href="/search/?searchtype=author'
# NOTE(review): presumably marks the line that opens a paper title - confirm at the use site.
TITLE_TAG = '<p class="title is-5 mathjax">'
# get_next_result() advances until a (stripped) line starts with this fragment.
ABSTRACT_TAG = '<span class="abstract-full has-text-grey-dark mathjax"'
# NOTE(review): presumably marks the "Submitted ..." date line - confirm at the use site.
DATE_TAG = '<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span>'


def get_authors(lines, i):
    """Collect consecutive author names starting at ``lines[i]``.

    A line counts as an author entry while it starts with ``AUTHOR_TAG``.
    The name is the text after the first ``>`` with the trailing markup
    removed.

    Args:
        lines: list of strings to scan.
        i: index of the first candidate author line.

    Returns:
        A tuple ``(authors, i)``: the list of extracted names and the index
        of the first line that is not an author entry (``len(lines)`` if the
        input ends inside the author list).
    """
    authors = []
    # Stop at the end of the input as well as at the first non-author line,
    # so input that ends inside the author list does not raise IndexError.
    while i < len(lines) and lines[i].startswith(AUTHOR_TAG):
        line = lines[i]
        name_start = line.find('>') + 1
        # Lines ending with ',' lose 5 trailing characters, others 4
        # (presumably the closing '</a>,' and '</a>' respectively).
        suffix_len = 5 if line.endswith(',') else 4
        authors.append(line[name_start:-suffix_len])
        i += 1
    return authors, i

def get_next_result(lines, start):

'''
def get_next_result(lines, start):
"""
Extract paper from the xml file obtained from arxiv search.
Each paper is a dict that contains:
Expand All @@ -39,11 +38,11 @@ def get_next_result(lines, start):
+ 'main_page': str
+ 'authors': []
+ 'abstract': str
'''
"""

result = {}
idx = lines[start + 3][10:].find('"')
result['main_page'] = lines[start + 3][9:10+idx]
result['main_page'] = lines[start + 3][9:10 + idx]
idx = lines[start + 4][23:].find('"')
result['pdf'] = lines[start + 4][22: 23 + idx] + '.pdf'

Expand All @@ -57,7 +56,7 @@ def get_next_result(lines, start):
title = title.replace('</span>', '')
result['title'] = title

authors, start = get_authors(lines, start + 5) # orig: add 8
authors, start = get_authors(lines, start + 5) # orig: add 8

while not lines[start].strip().startswith(ABSTRACT_TAG):
start += 1
Expand All @@ -72,34 +71,38 @@ def get_next_result(lines, start):
start += 1

idx = lines[start].find('</span> ')
end = lines[start][idx : ].find(';')
end = lines[start][idx:].find(';')

result['date'] = lines[start][idx + 8: idx + end]

return result, start


def clean_empty_lines(lines):
    """Strip every line and drop the ones that end up empty.

    Args:
        lines: iterable of strings.

    Returns:
        A new list with each surviving line stripped of leading and
        trailing whitespace, in the original order.
    """
    stripped = (line.strip() for line in lines)
    return [line for line in stripped if line]


def is_float(token):
    """Return True if ``token`` is digits, a dot, then digits (e.g. "1.7").

    Signs, exponents, and forms with a missing integer or fractional part
    (".5", "1.") are not accepted.

    Args:
        token: string to test.

    Returns:
        bool: whether the entire token has the ``digits.digits`` form.
    """
    # Raw string avoids invalid-escape warnings for \d and \.; fullmatch
    # rejects a trailing newline that ``$`` with re.match would let through.
    return re.fullmatch(r"\d+\.\d+", token) is not None


def is_citation_year(tokens, i):
    """Tell whether ``tokens[i]`` is a year inside a parenthesised citation.

    The token must be exactly four characters matching ``[12][0-9]{3}``,
    be preceded by ``,`` or ``(`` and be followed by ``)``, as in
    "(Smith, 2018)" or "(2018)".

    Args:
        tokens: list of string tokens.
        i: index of the token to test.

    Returns:
        bool: True only when the token and both neighbours fit the pattern.
    """
    token = tokens[i]
    if len(token) != 4 or re.match(r'[12][0-9]{3}', token) is None:
        return False
    # A citation year needs a neighbour on both sides.
    if i == 0 or i == len(tokens) - 1:
        return False
    return tokens[i - 1] in (',', '(') and tokens[i + 1] == ')'


def is_list_numer(tokens, i, value):
if value < 1 or value > 4:
return False
Expand All @@ -116,9 +119,9 @@ def has_number(sent):
for i, token in enumerate(tokens):
if token.endswith('\\'):
token = token[:-2]
if token.endswith('x'): # sometimes people write numbers as 1.7x
if token.endswith('x'): # sometimes people write numbers as 1.7x
token = token[:-1]
if token.startswith('x'): # sometimes people write numbers as x1.7
if token.startswith('x'): # sometimes people write numbers as x1.7
token = token[1:]
if token.startswith('$') and token.endswith('$'):
token = token[1:-1]
Expand All @@ -133,13 +136,14 @@ def has_number(sent):

return False


def contains_sota(sent):
    """Return True if ``sent`` mentions the state of the art.

    The phrases "state-of-the-art" and "state of the art" are matched
    regardless of letter case, so a sentence that starts with
    "State-of-the-art" is recognised. The abbreviation "SOTA" stays
    case-sensitive to avoid hits inside ordinary words such as "Minnesota".

    Args:
        sent: sentence text.

    Returns:
        bool: whether any of the markers occurs in the sentence.
    """
    lowered = sent.lower()
    return (
        'state-of-the-art' in lowered
        or 'state of the art' in lowered
        or 'SOTA' in sent
    )


def extract_line(abstract, keyword, limit):
lines = []
numbered_lines = []
has_sota = False
kw_mentioned = False
abstract = abstract.replace("et. al", "et al.")
sentences = abstract.split('. ')
Expand All @@ -150,7 +154,6 @@ def extract_line(abstract, keyword, limit):
if has_number(sent):
numbered_lines.append(sent)
elif contains_sota(sent):
has_sota = True
numbered_lines.append(sent)
else:
kw_sentences.append(sent)
Expand All @@ -167,7 +170,8 @@ def extract_line(abstract, keyword, limit):
if len(numbered_lines) > 0:
return '. '.join(numbered_lines), True
return '. '.join(lines[-2:]), False



def get_report(paper, keyword):
if keyword in paper['abstract'].lower():
title = h.unescape(paper['title'])
Expand All @@ -179,6 +183,7 @@ def get_report(paper, keyword):
return report, has_number
return '', False


def txt2reports(txt, keyword, num_to_show):
found = False
txt = ''.join(chr(c) for c in txt)
Expand Down Expand Up @@ -208,11 +213,11 @@ def txt2reports(txt, keyword, num_to_show):
break
return unshown, num_to_show, found

def get_papers(keyword, num_results=5):

'''
def get_papers(keyword, num_results=5):
"""
If keyword is an English word, then search in CS category only to avoid papers from other categories, resulted from the ambiguity
'''
"""

if keyword in set(['GAN', 'bpc']):
query_temp = 'https://arxiv.org/search/advanced?advanced=&terms-0-operator=AND&terms-0-term={}&terms-0-field=all&classification-computer_science=y&classification-physics_archives=all&date-filter_by=all_dates&date-year=&date-from_date=&date-to_date=&date-date_type=submitted_date&abstracts=show&size={}&order=-announced_date_first&start={}'
Expand Down Expand Up @@ -257,6 +262,7 @@ def get_papers(keyword, num_results=5):
all_unshown.extend(unshown)
page += 1


def main():
if len(sys.argv) < 2:
raise ValueError('You must specify a keyword')
Expand All @@ -273,11 +279,12 @@ def main():
return
if num_results <= 0:
raise ValueError('You must choose to show a positive number of results')

else:
num_results = 5

get_papers(keyword, num_results)


# Run the command-line entry point only when executed as a script.
if __name__ == '__main__':
    main()

0 comments on commit ba69bab

Please sign in to comment.