Skip to content

Commit

Permalink
Add Windows script support, setup.py installation support
Browse files Browse the repository at this point in the history
  • Loading branch information
titu1994 committed Oct 5, 2018
1 parent 1f7a8ab commit 2ba3cea
Show file tree
Hide file tree
Showing 5 changed files with 93 additions and 35 deletions.
5 changes: 3 additions & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
PyEnchant
nltk
nltk
six
PyEnchant[full]
22 changes: 22 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
from setuptools import setup, find_packages
import sotawhat

# Distribution metadata for the ``sotawhat`` package, gathered into named
# values so the ``setup()`` call below reads as a flat list of fields.

# Free-form description shown on the package index page.
_LONG_DESCRIPTION = (
    'I often get frustrated searching for the latest '
    'research results on Google and Arxiv so I wrote '
    'SOTAwhat, a script to query Arxiv for the latest '
    'abstracts and extract summaries from them. '
)

# PyEnchant is only pulled in by the optional ``full`` extra.
_EXTRAS = {
    'full': ['PyEnchant'],
}

# Installs a ``sotawhat`` console command that runs ``sotawhat.cmd_line.main``.
_ENTRY_POINTS = {
    'console_scripts': ['sotawhat=sotawhat.cmd_line:main'],
}

setup(
    name='sotawhat',
    # The version string is read from the package's ``__VERSION__`` attribute.
    version=str(sotawhat.__VERSION__),
    packages=find_packages(),
    description='arxiv-sanity query script',
    long_description=_LONG_DESCRIPTION,
    url='https://huyenchip.com/2018/10/04/sotawhat.html',
    license="",
    install_requires=['six', 'nltk'],
    extras_require=_EXTRAS,
    entry_points=_ENTRY_POINTS,
)
1 change: 1 addition & 0 deletions sotawhat/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
__VERSION__ = '0.0.1'
31 changes: 31 additions & 0 deletions sotawhat/cmd_line.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
import os
import argparse
import warnings
from sotawhat import sotawhat


def main():
    """Parse the command line and run the sotawhat query.

    Arguments read from ``sys.argv``:

    * ``query`` -- keyword to search for (required positional).
    * ``-c/--count`` -- number of results to display (int, default 5).
    * ``-e/--exact`` -- flag; treat the query as exact, with no spelling
      mistakes (default off).

    On Windows the function tries to enable ``win_unicode_console`` and emits
    a warning if that package is not installed. The parsed namespace is then
    handed to ``sotawhat.main``.
    """
    # The first positional parameter of ArgumentParser is ``prog``, not the
    # description; pass the text by keyword so the usage line shows the real
    # program name and the help output shows this sentence as the description.
    parser = argparse.ArgumentParser(
        description='Query arxiv-sanity to obtain most recent sota papers.')

    parser.add_argument('query', type=str,
                        help='Keyword to query')

    parser.add_argument('-c', '--count', type=int, default=5,
                        help='Number of results to display')

    # ``store_true`` already defaults to False, so no set_defaults() is needed.
    parser.add_argument('-e', '--exact', action='store_true', dest='exact',
                        help='Assume query is exact, with no spelling mistakes')

    args = parser.parse_args()

    # Compare for equality: a substring test would match any platform name
    # that merely contains "nt".
    if os.name == 'nt':
        try:
            import win_unicode_console
            win_unicode_console.enable()
        except ImportError:
            warnings.warn('On Windows, encoding errors may arise when displaying the data.\n'
                          'If such errors occur, please install `win_unicode_console` via \n'
                          'the command `pip install win-unicode-console`.')

    sotawhat.main(args)
69 changes: 36 additions & 33 deletions sotawhat.py → sotawhat/sotawhat.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,23 @@
import re
import sys
import os
import urllib.error
import urllib.request
import enchant
from nltk.tokenize import word_tokenize
from six.moves.html_parser import HTMLParser

import nltk
from nltk.tokenize import word_tokenize

# Record whether the ``enchant`` module can be imported; the flag is set to
# False when the import raises ImportError instead of failing the module.
try:
    import enchant
except ImportError:
    _ENCHANT_AVAILABLE = False
else:
    _ENCHANT_AVAILABLE = True

# Look up the 'tokenizers/punkt' resource in the local NLTK data and
# download 'punkt' when the lookup fails.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

# Shared helper used throughout this module as ``h.unescape(text)`` to turn
# HTML character references back into plain characters.
h = HTMLParser()
# ``HTMLParser.unescape`` was deprecated in Python 3.4 and removed in 3.9.
# When the method is missing, attach the standard ``html.unescape`` function
# so existing ``h.unescape(...)`` calls keep working on newer interpreters.
if not hasattr(h, 'unescape'):
    import html
    h.unescape = html.unescape

AUTHOR_TAG = '<a href="/search/?searchtype=author'
Expand Down Expand Up @@ -129,7 +141,7 @@ def has_number(sent):
return True
try:
value = int(token)
except:
except Exception:
continue
if (not is_citation_year(tokens, i)) and (not is_list_numer(tokens, i, value)):
return True
Expand Down Expand Up @@ -175,7 +187,7 @@ def extract_line(abstract, keyword, limit):
def get_report(paper, keyword):
if keyword in paper['abstract'].lower():
title = h.unescape(paper['title'])
headline = '{} ({} - {})\n'.format(title, paper['authors'][0], paper['date'])
headline = '{} ({} - {})\n'.format(title, h.unescape(paper['authors'][0]), paper['date'])
abstract = h.unescape(paper['abstract'])
extract, has_number = extract_line(abstract, keyword, 280 - len(headline))
if extract:
Expand Down Expand Up @@ -214,21 +226,28 @@ def txt2reports(txt, keyword, num_to_show):
return unshown, num_to_show, found


def get_papers(keyword, num_results=5):
def get_papers(keyword, num_results=5, args=None):
"""
If keyword is an English word, then search in CS category only to avoid papers from other categories, resulted from the ambiguity
"""

if keyword in set(['GAN', 'bpc']):
if keyword in {'GAN', 'bpc'}:
query_temp = 'https://arxiv.org/search/advanced?advanced=&terms-0-operator=AND&terms-0-term={}&terms-0-field=all&classification-computer_science=y&classification-physics_archives=all&date-filter_by=all_dates&date-year=&date-from_date=&date-to_date=&date-date_type=submitted_date&abstracts=show&size={}&order=-announced_date_first&start={}'
keyword = keyword.lower()
else:
keyword = keyword.lower()
d = enchant.Dict('en_US')
if d.check(keyword):
query_temp = 'https://arxiv.org/search/advanced?advanced=&terms-0-operator=AND&terms-0-term={}&terms-0-field=all&classification-computer_science=y&classification-physics_archives=all&date-filter_by=all_dates&date-year=&date-from_date=&date-to_date=&date-date_type=submitted_date&abstracts=show&size={}&order=-announced_date_first&start={}'
if _ENCHANT_AVAILABLE:
d = enchant.Dict('en_US')
if d.check(keyword):
query_temp = 'https://arxiv.org/search/advanced?advanced=&terms-0-operator=AND&terms-0-term={}&terms-0-field=all&classification-computer_science=y&classification-physics_archives=all&date-filter_by=all_dates&date-year=&date-from_date=&date-to_date=&date-date_type=submitted_date&abstracts=show&size={}&order=-announced_date_first&start={}'
else:
query_temp = 'https://arxiv.org/search/?searchtype=all&query={}&abstracts=show&size={}&order=-announced_date_first&start={}'
else:
query_temp = 'https://arxiv.org/search/?searchtype=all&query={}&abstracts=show&size={}&order=-announced_date_first&start={}'
if args.exact:
query_temp = 'https://arxiv.org/search/advanced?advanced=&terms-0-operator=AND&terms-0-term={}&terms-0-field=all&classification-computer_science=y&classification-physics_archives=all&date-filter_by=all_dates&date-year=&date-from_date=&date-to_date=&date-date_type=submitted_date&abstracts=show&size={}&order=-announced_date_first&start={}'
else:
query_temp = 'https://arxiv.org/search/?searchtype=all&query={}&abstracts=show&size={}&order=-announced_date_first&start={}'

keyword_q = keyword.replace(' ', '+')
page = 0
per_page = 200
Expand Down Expand Up @@ -263,28 +282,12 @@ def get_papers(keyword, num_results=5):
page += 1


def main():
if len(sys.argv) < 2:
raise ValueError('You must specify a keyword')
if len(sys.argv) > 3:
raise ValueError("Too many arguments")

keyword = sys.argv[1]

if len(sys.argv) == 3:
try:
num_results = int(sys.argv[2])
except:
print('The second argument must be an integer')
return
if num_results <= 0:
raise ValueError('You must choose to show a positive number of results')

else:
num_results = 5
def main(args):
keyword = str(args.query)
num_results = int(args.count)

get_papers(keyword, num_results)
if num_results <= 0:
raise ValueError('You must choose to show a positive number of results')

get_papers(keyword, num_results, args)

if __name__ == '__main__':
    # ``main`` requires the parsed-argument namespace (query, count, exact),
    # so calling it bare raises TypeError. Parse the command line here with
    # the same options as the installed ``sotawhat`` console command.
    import argparse

    _parser = argparse.ArgumentParser(
        description='Query arxiv-sanity to obtain most recent sota papers.')
    _parser.add_argument('query', type=str,
                         help='Keyword to query')
    _parser.add_argument('-c', '--count', type=int, default=5,
                         help='Number of results to display')
    _parser.add_argument('-e', '--exact', action='store_true', dest='exact',
                         help='Assume query is exact, with no spelling mistakes')

    main(_parser.parse_args())

0 comments on commit 2ba3cea

Please sign in to comment.