Skip to content

Commit

Permalink
Merge branch 'master' into master
Browse files Browse the repository at this point in the history
  • Loading branch information
hmchuong committed Nov 9, 2018
2 parents f2be3b5 + 806308a commit 9332e0e
Show file tree
Hide file tree
Showing 5 changed files with 119 additions and 58 deletions.
45 changes: 34 additions & 11 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,40 +2,63 @@

To see the live demo of this project, you can try in this [link](https://sotawhat.herokuapp.com/#/)

This script runs using Python 3.
This script runs using Python 3. It requires ``nltk``, ``six``, and ``pyspellchecker``. To install it as a Python package, follow these steps:

First, install the required packages. This script only requires ``nltk`` and ``PyEnchant``.

Step 1: clone this repo, and go inside that repo:
```bash
$ pip install -r requirements.txt
$ git clone [HTTPS or SSH link to this repo]
$ cd sotawhat
```

If you run the error that the package ``punkt`` doesn't exist, download it by going into your Python environment and running:
Step 2: install using pip

```bash
$ python3
$ pip3 install .
```

On Windows, due to encoding errors, the script may cause issues when run on the command line. It is
recommended to use `pip install win-unicode-console --upgrade` prior to launching the script. If you get
UnicodeEncodingError, you *must* install the above.

On macOS, you may encounter the following SSL error:

```
[nltk_data] Error loading punkt: <urlopen error [SSL:
[nltk_data] CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data] unable to get local issuer certificate (_ssl.c:1045)>
```

>>> import nltk
>>> nltk.download('punkt')
This can be fixed by reinstalling the certificates:
```shell
$ /Applications/Python\ 3.x/Install\ Certificates.command
```

# Usage
This project adds the `sotawhat` script, which you can run globally from the terminal or command line.

To query for a certain keyword, run:

```bash
$ python3 sotawhat.py "[keyword]" [number of results]
$ sotawhat [keyword] [number of results]
```

For example:

```bash
$ python3 sotawhat.py "perplexity" 10
$ sotawhat perplexity 10
```

or

```bash
$ sotawhat language model 10
```

If you don't specify the number of results, by default, the script returns 5 results. Each result contains the title of the paper with author and published date, a summary of the abstract, and link to the paper.

We've found that this script works well with keywords that are:
+ a model (e.g. transformer, wavenet, ...)
+ a dataset (e.g. wikitext, imagenet, ...)
+ a task (e.g. 'language model', 'machine translation', 'fuzzing', ...)
+ a task (e.g. language model, machine translation, fuzzing, ...)
+ a metric (e.g. BLEU, perplexity, ...)
+ random stuff
5 changes: 3 additions & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
PyEnchant
nltk
nltk
six
pyspellchecker
17 changes: 17 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
from setuptools import setup, find_packages
import sotawhat

setup(
name='sotawhat',
version=str(sotawhat.__VERSION__),
packages=find_packages(),
description='arxiv-sanity query script',
long_description=str('SOTAwhat is a script to query Arxiv for the latest '
'abstracts and extract summaries from them. '),
url='https://huyenchip.com/2018/10/04/sotawhat.html',
license="",
install_requires=['six', 'nltk', 'pyspellchecker'],
entry_points={
'console_scripts': ['sotawhat=sotawhat.sotawhat:main'],
}
)
1 change: 1 addition & 0 deletions sotawhat/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
__VERSION__ = '0.0.1'
109 changes: 64 additions & 45 deletions sotawhat.py → sotawhat/sotawhat.py
Original file line number Diff line number Diff line change
@@ -1,49 +1,56 @@
import datetime
import os
import re
import sys
import urllib
import urllib.error
import urllib.request

import nltk
from nltk.tokenize import word_tokenize
from six.moves.html_parser import HTMLParser
h = HTMLParser()
from spellchecker import SpellChecker

import enchant
from nltk.tokenize import word_tokenize
try:
nltk.data.find('tokenizers/punkt')
except LookupError:
nltk.download('punkt')

h = HTMLParser()

AUTHOR_TAG = '<a href="/search/?searchtype=author'
TITLE_TAG = '<p class="title is-5 mathjax">'
ABSTRACT_TAG = '<span class="abstract-full has-text-grey-dark mathjax"'
DATE_TAG = '<p class="is-size-7"><span class="has-text-black-bis has-text-weight-semibold">Submitted</span>'


def get_authors(lines, i):
    """Collect consecutive author names starting at line index ``i``.

    Each author line is an anchor tag beginning with ``AUTHOR_TAG``; the name
    sits between the first ``>`` and the closing tag.  Lines ending with a
    comma carry one extra trailing character (the comma) to strip.

    Returns:
        tuple: (list of author-name strings, index of the first non-author line)
    """
    authors = []
    while lines[i].startswith(AUTHOR_TAG):
        idx = lines[i].find('>')
        if lines[i].endswith(','):
            # '</a>,' suffix: drop 5 trailing characters.
            authors.append(lines[i][idx + 1: -5])
        else:
            # '</a>' suffix: drop 4 trailing characters.
            authors.append(lines[i][idx + 1: -4])
        i += 1
    return authors, i

def get_next_result(lines, start):

'''
def get_next_result(lines, start):
"""
Extract paper from the xml file obtained from arxiv search.
Each paper is a dict that contains:
+ 'title': str
+ 'pdf_link': str
+ 'main_page': str
+ 'authors': []
+ 'abstract': str
'''
"""

result = {}
idx = lines[start + 3][10:].find('"')
result['main_page'] = lines[start + 3][9:10+idx]
result['main_page'] = lines[start + 3][9:10 + idx]
idx = lines[start + 4][23:].find('"')
result['pdf'] = lines[start + 4][22: 23 + idx] + '.pdf'

Expand All @@ -57,7 +64,7 @@ def get_next_result(lines, start):
title = title.replace('</span>', '')
result['title'] = title

authors, start = get_authors(lines, start + 5) # orig: add 8
authors, start = get_authors(lines, start + 5) # orig: add 8

while not lines[start].strip().startswith(ABSTRACT_TAG):
start += 1
Expand All @@ -72,34 +79,38 @@ def get_next_result(lines, start):
start += 1

idx = lines[start].find('</span> ')
end = lines[start][idx : ].find(';')
end = lines[start][idx:].find(';')

result['date'] = lines[start][idx + 8: idx + end]

return result, start


def clean_empty_lines(lines):
    """Strip surrounding whitespace from each line and drop blank lines.

    Args:
        lines: iterable of strings.

    Returns:
        list[str]: stripped, non-empty lines in original order.
    """
    return [line.strip() for line in lines if line.strip()]


def is_float(token):
    """Return True if ``token`` is a plain decimal number like ``'1.7'``."""
    # Raw string: '\d' in a normal string is an invalid escape sequence
    # (DeprecationWarning, and a SyntaxError in future Python versions).
    return re.match(r"^\d+?\.\d+?$", token) is not None


def is_citation_year(tokens, i):
    """Return True if ``tokens[i]`` looks like a year inside a citation.

    A citation year is a 4-digit token starting with 1 or 2 that is
    immediately preceded by ',' or '(' and followed by ')', e.g. the
    '2018' in '(Smith, 2018)'.  Tokens at either end of the sentence
    cannot qualify because both neighbours must exist.
    """
    if len(tokens[i]) != 4:
        return False
    if re.match(r'[12][0-9]{3}', tokens[i]) is None:
        return False
    if i == 0 or i == len(tokens) - 1:
        return False
    if (tokens[i - 1] == ',' or tokens[i - 1] == '(') and tokens[i + 1] == ')':
        return True
    return False


def is_list_numer(tokens, i, value):
if value < 1 or value > 4:
return False
Expand All @@ -116,9 +127,9 @@ def has_number(sent):
for i, token in enumerate(tokens):
if token.endswith('\\'):
token = token[:-2]
if token.endswith('x'): # sometimes people write numbers as 1.7x
if token.endswith('x'): # sometimes people write numbers as 1.7x
token = token[:-1]
if token.startswith('x'): # sometimes people write numbers as x1.7
if token.startswith('x'): # sometimes people write numbers as x1.7
token = token[1:]
if token.startswith('$') and token.endswith('$'):
token = token[1:-1]
Expand All @@ -133,13 +144,14 @@ def has_number(sent):

return False


def contains_sota(sent):
    """Return True if the sentence claims state-of-the-art results."""
    markers = ('state-of-the-art', 'state of the art', 'SOTA')
    return any(marker in sent for marker in markers)


def extract_line(abstract, keyword, limit):
lines = []
numbered_lines = []
has_sota = False
kw_mentioned = False
abstract = abstract.replace("et. al", "et al.")
sentences = abstract.split('. ')
Expand All @@ -150,7 +162,6 @@ def extract_line(abstract, keyword, limit):
if has_number(sent):
numbered_lines.append(sent)
elif contains_sota(sent):
has_sota = True
numbered_lines.append(sent)
else:
kw_sentences.append(sent)
Expand All @@ -167,7 +178,8 @@ def extract_line(abstract, keyword, limit):
if len(numbered_lines) > 0:
return '. '.join(numbered_lines), True
return '. '.join(lines[-2:]), False



def get_report(paper, keyword):
if keyword in paper['abstract'].lower():
title = h.unescape(paper['title'])
Expand All @@ -179,6 +191,7 @@ def get_report(paper, keyword):
return report, has_number
return '', False


def txt2reports(txt, keyword, num_to_show):
found = False
txt = ''.join(chr(c) for c in txt)
Expand Down Expand Up @@ -208,19 +221,20 @@ def txt2reports(txt, keyword, num_to_show):
break
return unshown, num_to_show, found

def get_papers(keyword, num_results=5):

'''
def get_papers(keyword, num_results=5):
"""
If keyword is an English word, then search in CS category only to avoid papers from other categories, resulted from the ambiguity
'''
"""

if keyword in set(['GAN', 'bpc']):
query_temp = 'https://arxiv.org/search/advanced?advanced=&terms-0-operator=AND&terms-0-term={}&terms-0-field=all&classification-computer_science=y&classification-physics_archives=all&date-filter_by=all_dates&date-year=&date-from_date=&date-to_date=&date-date_type=submitted_date&abstracts=show&size={}&order=-announced_date_first&start={}'
keyword = keyword.lower()
else:
keyword = keyword.lower()
d = enchant.Dict('en_US')
if d.check(keyword):
words = keyword.split()
d = SpellChecker()
if not d.unknown(words):
query_temp = 'https://arxiv.org/search/advanced?advanced=&terms-0-operator=AND&terms-0-term={}&terms-0-field=all&classification-computer_science=y&classification-physics_archives=all&date-filter_by=all_dates&date-year=&date-from_date=&date-to_date=&date-date_type=submitted_date&abstracts=show&size={}&order=-announced_date_first&start={}'
else:
query_temp = 'https://arxiv.org/search/?searchtype=all&query={}&abstracts=show&size={}&order=-announced_date_first&start={}'
Expand Down Expand Up @@ -257,27 +271,32 @@ def get_papers(keyword, num_results=5):
all_unshown.extend(unshown)
page += 1


def main():
    """Command-line entry point.

    Usage: ``sotawhat [keyword ...] [number of results]``.  If the final
    argument parses as an integer it is taken as the result count and the
    remaining arguments form the keyword; otherwise all arguments form the
    keyword and the count defaults to 5.

    Raises:
        ValueError: if no keyword is given.
    """
    if 'nt' in os.name:
        # Windows consoles mishandle Unicode output; enable the helper
        # package when it is installed, otherwise warn the user.
        try:
            import win_unicode_console
            win_unicode_console.enable()
        except ImportError:
            import warnings  # local import: only needed on this rare path
            warnings.warn('On Windows, encoding errors may arise when displaying the data.\n'
                          'If such errors occur, please install `win_unicode_console` via \n'
                          'the command `pip install win-unicode-console`.')

    if len(sys.argv) < 2:
        raise ValueError('You must specify a keyword')

    try:
        # If the last argument is an integer, it is the result count and
        # everything before it is the (possibly multi-word) keyword.
        num_results = int(sys.argv[-1])
        # NOTE(review): assert is stripped under `python -O`; a raised
        # exception would be safer, but raising ValueError here would be
        # swallowed by the except clause below, so the assert is kept.
        assert num_results > 0, 'You must choose to show a positive number of results'
        keyword = ' '.join(sys.argv[1:-1])
    except ValueError:
        # Last argument is part of the keyword; fall back to the default.
        keyword = ' '.join(sys.argv[1:])
        num_results = 5

    get_papers(keyword, num_results)


if __name__ == '__main__':
    # Single entry-point call (the diff artifact duplicated this line).
    main()

0 comments on commit 9332e0e

Please sign in to comment.