Skip to content

Commit

Permalink
student projects: reports
Browse files Browse the repository at this point in the history
  • Loading branch information
mdeff committed Feb 7, 2018
1 parent 20dccd8 commit fb59444
Show file tree
Hide file tree
Showing 727 changed files with 440,567 additions and 0 deletions.
Empty file.
Empty file.
4,144 changes: 4,144 additions & 0 deletions projects/reports/amazon_products/project main notebook.ipynb

Large diffs are not rendered by default.

Empty file.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
3,000 changes: 3,000 additions & 0 deletions projects/reports/arab_springs/A_Network_Tour_of_the_Arab_Spring.ipynb

Large diffs are not rendered by default.

Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Binary file added projects/reports/arab_springs/cnn.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Empty file.
94 changes: 94 additions & 0 deletions projects/reports/arab_springs/english_dictionary.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
###############################################################
########################## IMPORTS ############################
###############################################################
import numpy as np
import string
from nltk.tokenize import TweetTokenizer
import pickle


###############################################################
########################## FUNCTIONS ##########################
###############################################################
def tokenize(text):
    """Split *text* into tokens using nltk's tweet-aware tokenizer.

    param text: string to tokenize
    return: list of token strings
    """
    return TweetTokenizer().tokenize(text)

def correct(sentence, contrac_dict=None):
    '''
    replace contractions in sentence and remove punctuation
    param sentence: string
    param contrac_dict: dictionary, english contraction -> expansion
    return string, corrected sentence (tokens joined by single spaces)
    '''
    # Avoid the mutable-default-argument footgun of `contrac_dict={}`.
    if contrac_dict is None:
        contrac_dict = {}
    new_tokens = []
    for token in tokenize(sentence):
        if token in contrac_dict:
            # Replace the whole token by its expansion. Using elif below
            # fixes the original bug where a known contraction was appended
            # twice: once expanded and once punctuation-stripped.
            new_tokens.append(contrac_dict[token])
        elif len(token) > 1:
            stripped = ''.join(c for c in token if c not in string.punctuation)
            # Skip tokens that were pure punctuation (e.g. "..."): the old
            # code appended '' here, injecting double spaces into the result.
            if stripped:
                new_tokens.append(stripped)
    return ' '.join(new_tokens)


###############################################################
############################ MAIN #############################
###############################################################
def main():
    """Build the token-normalization dictionaries and pickle them.

    Reads four word lists (english words, contractions, acronyms, smileys),
    assembles:
      - freq_dict:  english word -> frequency string
      - final_dict: token -> normalized form (identity for plain english
        words; expansion text for acronyms and smileys)
    and saves [final_dict, freq_dict] to ../data/dictionaries.p.
    """

    ######## Upload dictionaries ######
    ###################################
    # Define Paths
    # BASE = '../data/dictionaries/'
    BASE = ''

    def _read_lines(filename):
        # Read one word list: one entry per line, lower-cased, newline
        # stripped. `with` guarantees the file handle is closed (the old
        # code leaked four open files).
        with open(BASE + filename) as f:
            return np.asarray([line.rstrip('\n').lower() for line in f])

    ## English Dictionary: 3 lines per entry; line 3k+1 is the word,
    ## line 3k+2 its frequency.
    english_words = _read_lines('english_words.txt')
    idx = np.arange(len(english_words) // 3)
    english_dictionary = dict(zip(english_words[3*idx+1], english_words[3*idx+2]))
    # NOTE: the original also built an identical, never-used `freq` dict
    # here; that dead duplicate has been removed.

    ## English contractions (#ignore): 2 lines per entry (key, expansion).
    contractions = _read_lines('contractions.txt')
    idx = np.arange(len(contractions) // 2)
    contractions_dict = dict(zip(contractions[2*idx], contractions[2*idx+1]))

    ## Acronyms: 2 lines per entry (acronym, explanation).
    acronyms = _read_lines('netlingo_acronyms.txt')
    idx = np.arange(len(acronyms) // 2)
    acronyms_dict = dict(zip(acronyms[2*idx], acronyms[2*idx+1]))

    # Keep only the first explanation ('/ '-separated alternatives) and
    # normalize it; the two original loops are merged since each key is
    # processed independently.
    for key in acronyms_dict:
        acronyms_dict[key] = correct(acronyms_dict[key].split('/ ')[0])

    ## Smileys: 2 lines per entry (smiley, description).
    smileys = _read_lines('netlingo_smileys.txt')
    idx = np.arange(len(smileys) // 2)
    smileys_dict = dict(zip(smileys[2*idx], smileys[2*idx+1]))

    # Keep only the first description ('- '-separated alternatives).
    for key in smileys_dict:
        smileys_dict[key] = smileys_dict[key].split('- ')[0]

    ## Final Dictionaries
    freq_dict = dict(english_dictionary)
    final_dict = {k: k for k in english_dictionary}
    final_dict.update(acronyms_dict)
    final_dict.update(smileys_dict)

    # Save Dictionary (with-block closes the output file, unlike the
    # original bare open()).
    with open('../data/dictionaries.p', 'wb') as f:
        pickle.dump([final_dict, freq_dict], f)

if __name__ == "__main__":
    main()
Loading

0 comments on commit fb59444

Please sign in to comment.