Skip to content

Commit

Permalink
Merge branch 'master' into master
Browse files Browse the repository at this point in the history
  • Loading branch information
hmchuong committed Nov 9, 2018
2 parents f2be3b5 + 806308a commit 9332e0e
Show file tree
Hide file tree
Showing 5 changed files with 119 additions and 58 deletions.
45 changes: 34 additions & 11 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,40 +2,63 @@

To see the live demo of this project, you can try in this [link](https://sotawhat.herokuapp.com/#/)

This script runs using Python 3.
This script runs using Python 3. It requires ``nltk``, ``six``, and ``pyspellchecker``. To install it as a Python package, follow these steps:

First, install the required packages. This script only requires ``nltk`` and ``PyEnchant``.

Step 1: clone this repo, and go inside that repo:
```bash
$ pip install -r requirements.txt
$ git clone [HTTPS or SSH link to this repo]
$ cd sotawhat
```

If you run the error that the package ``punkt`` doesn't exist, download it by going into your Python environment and running:
Step 2: install using pip

```bash
$ python3
$ pip3 install .
```

On Windows, due to encoding errors, the script may cause issues when run on the command line. It is
recommended to use `pip install win-unicode-console --upgrade` prior to launching the script. If you get
UnicodeEncodingError, you *must* install the above.

On macOS, you may encounter the following SSL error:

```
[nltk_data] Error loading punkt: <urlopen error [SSL:
[nltk_data] CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data] unable to get local issuer certificate (_ssl.c:1045)>
```

>>> import nltk
>>> nltk.download('punkt')
This can be fixed by reinstalling the certificates:
```shell
$ /Applications/Python\ 3.x/Install\ Certificates.command
```

# Usage
This project adds the `sotawhat` script, which you can run globally from the terminal or command line.

To query for a certain keyword, run:

```bash
$ python3 sotawhat.py "[keyword]" [number of results]
$ sotawhat [keyword] [number of results]
```

For example:

```bash
$ python3 sotawhat.py "perplexity" 10
$ sotawhat perplexity 10
```

or

```bash
$ sotawhat language model 10
```

If you don't specify the number of results, by default, the script returns 5 results. Each result contains the title of the paper with author and published date, a summary of the abstract, and link to the paper.

We've found that this script works well with keywords that are:
+ a model (e.g. transformer, wavenet, ...)
+ a dataset (e.g. wikitext, imagenet, ...)
+ a task (e.g. 'language model', 'machine translation', 'fuzzing', ...)
+ a task (e.g. language model, machine translation, fuzzing, ...)
+ a metric (e.g. BLEU, perplexity, ...)
+ random stuff
5 changes: 3 additions & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
PyEnchant
nltk
nltk
six
pyspellchecker
17 changes: 17 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
from setuptools import setup, find_packages
import sotawhat

setup(
name='sotawhat',
version=str(sotawhat.__VERSION__),
packages=find_packages(),
description='arxiv-sanity query script',
long_description=str('SOTAwhat is a script to query Arxiv for the latest '
'abstracts and extract summaries from them. '),
url='https://huyenchip.com/2018/10/04/sotawhat.html',
license="",
install_requires=['six', 'nltk', 'pyspellchecker'],
entry_points={
'console_scripts': ['sotawhat=sotawhat.sotawhat:main'],
}
)
1 change: 1 addition & 0 deletions sotawhat/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
__VERSION__ = '0.0.1'
109 changes: 64 additions & 45 deletions sotawhat.py → sotawhat/sotawhat.py
Original file line number Diff line number Diff line change
@@ -1,49 +1,56 @@
import datetime
import os
import re
import sys
import urllib
import urllib.error
import urllib.request

import nltk
from nltk.tokenize import word_tokenize
from six.moves.html_parser import HTMLParser
h = HTMLParser()
from spellchecker import SpellChecker

import enchant
from nltk.tokenize import word_tokenize
try:
nltk.data.find('tokenizers/punkt')
except LookupError:
nltk.download('punkt')

h = HTMLParser()

AUTHOR_TAG = '<a href="/search/?searchtype=author'
TITLE_TAG = '<p class="title is-5 mathjax">'
ABSTRACT_TAG = '<span class="abstract-full has-text-grey-dark mathjax"'
DATE_TAG = '<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span>'


def get_authors(lines, i):
    """Collect consecutive author names starting at line index ``i``.

    Each author line is an anchor tag beginning with ``AUTHOR_TAG``; the name
    sits between the first ``>`` and the closing tag.  Lines ending with a
    comma carry one extra trailing character (the comma) to strip.

    Returns:
        tuple: (list of author-name strings, index of the first non-author line)
    """
    authors = []
    while lines[i].startswith(AUTHOR_TAG):
        idx = lines[i].find('>')
        if lines[i].endswith(','):
            # '</a>,' suffix: drop 5 trailing characters.
            authors.append(lines[i][idx + 1: -5])
        else:
            # '</a>' suffix: drop 4 trailing characters.
            authors.append(lines[i][idx + 1: -4])
        i += 1
    return authors, i

def get_next_result(lines, start):

'''
def get_next_result(lines, start):
"""
Extract paper from the xml file obtained from arxiv search.
Each paper is a dict that contains:
+ 'title': str
+ 'pdf_link': str
+ 'main_page': str
+ 'authors': []
+ 'abstract': str
'''
"""

result = {}
idx = lines[start + 3][10:].find('"')
result['main_page'] = lines[start + 3][9:10+idx]
result['main_page'] = lines[start + 3][9:10 + idx]
idx = lines[start + 4][23:].find('"')
result['pdf'] = lines[start + 4][22: 23 + idx] + '.pdf'

Expand All @@ -57,7 +64,7 @@ def get_next_result(lines, start):
title = title.replace('</span>', '')
result['title'] = title

authors, start = get_authors(lines, start + 5) # orig: add 8
authors, start = get_authors(lines, start + 5) # orig: add 8

while not lines[start].strip().startswith(ABSTRACT_TAG):
start += 1
Expand All @@ -72,34 +79,38 @@ def get_next_result(lines, start):
start += 1

idx = lines[start].find('</span> ')
end = lines[start][idx : ].find(';')
end = lines[start][idx:].find(';')

result['date'] = lines[start][idx + 8: idx + end]

return result, start


def clean_empty_lines(lines):
    """Strip surrounding whitespace from each line and drop blank lines.

    Args:
        lines: iterable of strings.

    Returns:
        list[str]: stripped, non-empty lines in original order.
    """
    return [line.strip() for line in lines if line.strip()]


def is_float(token):
    """Return True if ``token`` is a plain decimal number like ``'1.7'``."""
    # Raw string: '\d' in a normal string is an invalid escape sequence
    # (DeprecationWarning, and a SyntaxError in future Python versions).
    return re.match(r"^\d+?\.\d+?$", token) is not None


def is_citation_year(tokens, i):
    """Return True if ``tokens[i]`` looks like a year inside a citation.

    A citation year is a 4-digit token starting with 1 or 2 that is
    immediately preceded by ',' or '(' and followed by ')', e.g. the
    '2018' in '(Smith, 2018)'.  Tokens at either end of the sentence
    cannot qualify because both neighbours must exist.
    """
    if len(tokens[i]) != 4:
        return False
    if re.match(r'[12][0-9]{3}', tokens[i]) is None:
        return False
    if i == 0 or i == len(tokens) - 1:
        return False
    if (tokens[i - 1] == ',' or tokens[i - 1] == '(') and tokens[i + 1] == ')':
        return True
    return False


def is_list_numer(tokens, i, value):
if value < 1 or value > 4:
return False
Expand All @@ -116,9 +127,9 @@ def has_number(sent):
for i, token in enumerate(tokens):
if token.endswith('\\'):
token = token[:-2]
if token.endswith('x'): # sometimes people write numbers as 1.7x
if token.endswith('x'): # sometimes people write numbers as 1.7x
token = token[:-1]
if token.startswith('x'): # sometimes people write numbers as x1.7
if token.startswith('x'): # sometimes people write numbers as x1.7
token = token[1:]
if token.startswith('$') and token.endswith('$'):
token = token[1:-1]
Expand All @@ -133,13 +144,14 @@ def has_number(sent):

return False


def contains_sota(sent):
    """Return True if the sentence claims state-of-the-art results."""
    markers = ('state-of-the-art', 'state of the art', 'SOTA')
    return any(marker in sent for marker in markers)


def extract_line(abstract, keyword, limit):
lines = []
numbered_lines = []
has_sota = False
kw_mentioned = False
abstract = abstract.replace("et. al", "et al.")
sentences = abstract.split('. ')
Expand All @@ -150,7 +162,6 @@ def extract_line(abstract, keyword, limit):
if has_number(sent):
numbered_lines.append(sent)
elif contains_sota(sent):
has_sota = True
numbered_lines.append(sent)
else:
kw_sentences.append(sent)
Expand All @@ -167,7 +178,8 @@ def extract_line(abstract, keyword, limit):
if len(numbered_lines) > 0:
return '. '.join(numbered_lines), True
return '. '.join(lines[-2:]), False



def get_report(paper, keyword):
if keyword in paper['abstract'].lower():
title = h.unescape(paper['title'])
Expand All @@ -179,6 +191,7 @@ def get_report(paper, keyword):
return report, has_number
return '', False


def txt2reports(txt, keyword, num_to_show):
found = False
txt = ''.join(chr(c) for c in txt)
Expand Down Expand Up @@ -208,19 +221,20 @@ def txt2reports(txt, keyword, num_to_show):
break
return unshown, num_to_show, found

def get_papers(keyword, num_results=5):

'''
def get_papers(keyword, num_results=5):
"""
If keyword is an English word, then search in CS category only to avoid papers from other categories, resulted from the ambiguity
'''
"""

if keyword in set(['GAN', 'bpc']):
query_temp = 'https://arxiv.org/search/advanced?advanced=&terms-0-operator=AND&terms-0-term={}&terms-0-field=all&classification-computer_science=y&classification-physics_archives=all&date-filter_by=all_dates&date-year=&date-from_date=&date-to_date=&date-date_type=submitted_date&abstracts=show&size={}&order=-announced_date_first&start={}'
keyword = keyword.lower()
else:
keyword = keyword.lower()
d = enchant.Dict('en_US')
if d.check(keyword):
words = keyword.split()
d = SpellChecker()
if not d.unknown(words):
query_temp = 'https://arxiv.org/search/advanced?advanced=&terms-0-operator=AND&terms-0-term={}&terms-0-field=all&classification-computer_science=y&classification-physics_archives=all&date-filter_by=all_dates&date-year=&date-from_date=&date-to_date=&date-date_type=submitted_date&abstracts=show&size={}&order=-announced_date_first&start={}'
else:
query_temp = 'https://arxiv.org/search/?searchtype=all&query={}&abstracts=show&size={}&order=-announced_date_first&start={}'
Expand Down Expand Up @@ -257,27 +271,32 @@ def get_papers(keyword, num_results=5):
all_unshown.extend(unshown)
page += 1


def main():
    """Command-line entry point.

    Usage: ``sotawhat [keyword ...] [number of results]``.  If the final
    argument parses as an integer it is taken as the result count and the
    remaining arguments form the keyword; otherwise all arguments form the
    keyword and the count defaults to 5.

    Raises:
        ValueError: if no keyword is given.
    """
    if 'nt' in os.name:
        # Windows consoles mishandle Unicode output; enable the helper
        # package when it is installed, otherwise warn the user.
        try:
            import win_unicode_console
            win_unicode_console.enable()
        except ImportError:
            import warnings  # local import: only needed on this rare path
            warnings.warn('On Windows, encoding errors may arise when displaying the data.\n'
                          'If such errors occur, please install `win_unicode_console` via \n'
                          'the command `pip install win-unicode-console`.')

    if len(sys.argv) < 2:
        raise ValueError('You must specify a keyword')

    try:
        # If the last argument is an integer, it is the result count and
        # everything before it is the (possibly multi-word) keyword.
        num_results = int(sys.argv[-1])
        # NOTE(review): assert is stripped under `python -O`; a raised
        # exception would be safer, but raising ValueError here would be
        # swallowed by the except clause below, so the assert is kept.
        assert num_results > 0, 'You must choose to show a positive number of results'
        keyword = ' '.join(sys.argv[1:-1])
    except ValueError:
        # Last argument is part of the keyword; fall back to the default.
        keyword = ' '.join(sys.argv[1:])
        num_results = 5

    get_papers(keyword, num_results)


if __name__ == '__main__':
    # Single entry-point call (the diff artifact duplicated this line).
    main()

0 comments on commit 9332e0e

Please sign in to comment.