-
Notifications
You must be signed in to change notification settings - Fork 31
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
727 changed files
with
440,567 additions
and
0 deletions.
There are no files selected for viewing
Empty file.
Empty file.
4,144 changes: 4,144 additions & 0 deletions
4,144
projects/reports/amazon_products/project main notebook.ipynb
Large diffs are not rendered by default.
Oops, something went wrong.
Empty file.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
3,000 changes: 3,000 additions & 0 deletions
3,000
projects/reports/arab_springs/A_Network_Tour_of_the_Arab_Spring.ipynb
Large diffs are not rendered by default.
Oops, something went wrong.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,94 @@ | ||
############################################################### | ||
########################## IMPORTS ############################ | ||
############################################################### | ||
import numpy as np | ||
import string | ||
from nltk.tokenize import TweetTokenizer | ||
import pickle | ||
|
||
|
||
############################################################### | ||
########################## FUNCTIONS ########################## | ||
############################################################### | ||
def tokenize(text):
    '''
    Split text into tokens with NLTK's tweet-aware tokenizer
    param text: string
    return: list of tokens
    '''
    return TweetTokenizer().tokenize(text)
|
||
def correct(sentence, contrac_dict=None):
    '''
    Replace contractions in sentence and remove punctuation
    param sentence: string
    param contrac_dict: dictionary, english contractions (token -> expansion)
    return: string, corrected sentence
    '''
    # Avoid a mutable default argument; None means "no contraction table".
    if contrac_dict is None:
        contrac_dict = {}
    new_tokens = []
    for token in tokenize(sentence):
        if token in contrac_dict:
            new_tokens.append(contrac_dict[token])
        # BUGFIX: this branch was previously a plain `if`, so a token that
        # matched contrac_dict was appended twice (expansion + stripped
        # original). `elif` keeps exactly one form of each token.
        elif len(token) > 1:
            # Strip punctuation characters; single-character tokens
            # (mostly stand-alone punctuation) are dropped entirely.
            new_tokens.append(''.join(c for c in token if c not in string.punctuation))
    return ' '.join(new_tokens)
|
||
|
||
############################################################### | ||
############################ MAIN ############################# | ||
############################################################### | ||
###############################################################
############################ MAIN #############################
###############################################################
def main():
    '''
    Build the text-normalization dictionaries from the raw word lists
    (English words, contractions, NetLingo acronyms and smileys) and
    pickle [final_dict, freq_dict] to ../data/dictionaries.p.
    '''

    ######## Upload dictionaries ######
    ###################################
    # Define Paths
    # BASE = '../data/dictionaries/'
    BASE = ''

    def _load_lines(path):
        '''Read path and return its lowercased, newline-stripped lines as a numpy array.'''
        # `with` guarantees the file handle is closed (the original leaked it).
        with open(path) as f:
            return np.asarray([line.rstrip('\n').lower() for line in f])

    def _pairs_to_dict(lines):
        '''Interpret lines as alternating key/value lines and return a dict.'''
        idx = np.arange(len(lines) // 2)
        return dict(zip(lines[2 * idx], lines[2 * idx + 1]))

    ## English dictionary: entries come in groups of three lines; line 3k+1
    ## is the word and line 3k+2 its companion value (presumably a
    ## frequency — TODO confirm against english_words.txt).
    english_words = _load_lines(BASE + 'english_words.txt')
    idx = np.arange(len(english_words) // 3)
    english_dictionary = dict(zip(english_words[3 * idx + 1], english_words[3 * idx + 2]))

    ## English contractions (key/value line pairs)
    contractions_dict = _pairs_to_dict(_load_lines(BASE + 'contractions.txt'))

    ## Acronyms (key/value line pairs)
    acronyms_dict = _pairs_to_dict(_load_lines(BASE + 'netlingo_acronyms.txt'))

    # Remove multi explications: keep only the first '/'-separated meaning.
    for key in acronyms_dict:
        acronyms_dict[key] = acronyms_dict[key].split('/ ')[0]

    # Correct descriptions (expand contractions, strip punctuation).
    # BUGFIX: contractions_dict was built but never passed in, so
    # correct() silently fell back to its empty default and contractions
    # were never expanded.
    for key in acronyms_dict:
        acronyms_dict[key] = correct(acronyms_dict[key], contractions_dict)

    ## Smileys (key/value line pairs)
    smileys_dict = _pairs_to_dict(_load_lines(BASE + 'netlingo_smileys.txt'))

    # Remove multi explications: keep only the first '- '-separated meaning.
    for key in smileys_dict:
        smileys_dict[key] = smileys_dict[key].split('- ')[0]

    ## Final dictionaries: freq_dict maps word -> companion value; the
    ## original built this mapping twice (an unused `freq` plus a
    ## redundant copy comprehension) — a single dict() copy suffices.
    freq_dict = dict(english_dictionary)

    # Identity mapping for known English words, then overlay the acronym
    # and smiley expansions (later updates win on key clashes).
    final_dict = {k: k for k in english_dictionary}
    final_dict.update(acronyms_dict)
    final_dict.update(smileys_dict)

    # Save Dictionary (with-block closes the output handle, which the
    # original left dangling).
    with open('../data/dictionaries.p', 'wb') as out:
        pickle.dump([final_dict, freq_dict], out)


if __name__ == "__main__":
    main()
Oops, something went wrong.