
Commit

Merge pull request #15 from akash-suresh/sudo
Completed code. Yet to clean it up.
sudarshansk committed Apr 2, 2017
2 parents eb2c0c6 + 1cb5ddc commit 08fb33a
Showing 9 changed files with 238 additions and 172 deletions.
343 changes: 183 additions & 160 deletions LexChain/Boochain.py
Expand Up @@ -6,172 +6,195 @@
import sys
reload(sys)
sys.setdefaultencoding('utf8')
sys.path.append('../Lexrank')

threshold = 0.6 #threshold for wup similarity
jcnThreshold = 0.09 #threshold for jcn similarity
pathThreshold = 0.1 #threshold for path similarity
brown_ic = wordnet_ic.ic('ic-brown.dat') #load the Brown information-content corpus
lexical_chains = [] #empty list to hold all the chains
dictionary = {} #empty dictionary to hold the count of each word encountered
from summa.preprocessing.textcleaner import clean_text_by_sentences as clean


def findWholeWord(w):
return re.compile(r'\b({0})\b'.format(w), flags=re.IGNORECASE).search
def LexicalChain(fileName="amazon.txt", verbose=0):

#class Chain
class Chain():
def __init__(self, words, senses, count = 0):
self.words = set(words)
self.senses = set(senses)
self.count = count #store the count so incCount() works
dictionary[words[0]] = 1 #initialize counter

def addWord(self, word):

if word in self.words:
dictionary[word] += 1
else:
dictionary[word] = 1

self.words.add(word)


def addSense(self, sense):
self.senses.add(sense)

def getWords(self):
return self.words

def getSenses(self):
return self.senses #return the sense set, not the method itself

def incCount(self):
self.count += 1

def setScore(self, sc):
self.score = sc

def mfword(self):
maxfreq = 0
for word in self.getWords():
if dictionary[word] > maxfreq:
maxword = word
maxfreq = dictionary[word]
return maxword

def findWholeWord(w):
return re.compile(r'\b({0})\b'.format(w), flags=re.IGNORECASE).search
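# Example: findWholeWord('cloud')('Cloud computing grew') returns a match,
# while findWholeWord('cloud')('clouds') returns None; the \b anchors
# restrict the search to whole words and IGNORECASE makes it case-blind.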

def add_word(word):
maximum = 0
maxJCN = 0
flag = 0
for chain in lexical_chains: #for all chains that are present
for synset in wn.synsets(word): #for all synsets of current word
for sense in chain.senses: #for all senses already in the current chain
similarity = sense.wup_similarity(synset) #using wup_similarity
#class Chain
class Chain():
def __init__(self, words, senses, count = 0):
self.words = set(words)
self.senses = set(senses)
self.count = count #store the count so incCount() works
dictionary[words[0]] = 1 #initialize counter
def addWord(self, word):

if(similarity >= maximum):
if similarity >= threshold:
#print word, synset, sense, sense.jcn_similarity(synset, brown_ic)
JCN = sense.jcn_similarity(synset, brown_ic) #using jcn_similarity
if JCN >= jcnThreshold:
if sense.path_similarity(synset) >= 0.2: #using path similarity
if JCN >= maxJCN:
maximum = similarity
maxJCN = JCN
maxChain = chain
flag = 1
if flag == 1:
maxChain.addWord(word)
maxChain.addSense(synset)
return

lexical_chains.append(Chain([word], wn.synsets(word)))

def count_words(summary):
count = 0
for line in summary:
count = count + len(line.split(' '))
return count
#fileName = raw_input("Enter file path + name, if file name is 'nlp.txt', type 'nlp' \n \n")
#n = raw_input("Enter number of sentences in summary.\n")
word_count=50
fileName = "amazon.txt"
print ("\n\n")
#fileName = "nlp.txt"
File = open(fileName) #open file
lines = File.read() #read all lines
#dec_lines = [line.decode('utf-8') for line in lines]

line_list = lines.split('. ')


is_noun = lambda pos: pos in ('NN', 'NNP', 'NNS', 'NNPS') #keep only noun POS tags
nouns = [word for (word, pos) in nltk.pos_tag(nltk.word_tokenize(lines)) if is_noun(pos)] #extract all nouns

for word in nouns:
add_word(word)

#print all chains
for chain in lexical_chains:
chain_length = 0
dis_word = 0
for word in chain.getWords():
#print str(word + "(" + str(dictionary[word]) + ")") + ',',
chain_length = chain_length + dictionary[word]
dis_word = dis_word + 1
#print 'Length =' + str(chain_length)
hom = 1 - (dis_word*1.0/chain_length)
#print 'Homogeneity =' + str(hom)
score = 1.0*chain_length*hom
#print 'Score =' + str(score)
chain.setScore(score)

print 'Sorted start '
lexical_chains.sort(key=lambda x: x.score, reverse=True)

for chain in lexical_chains:
if(chain.score>0.0):
for word in chain.getWords():
print str(word + "(" + str(dictionary[word]) + ")") + ',',
print 'Score=' + str(chain.score)

summary = []
line_flags = []
line_score=[]

for line in line_list:
line_flags.append(0)
line_score.append(0)

for chain in lexical_chains:
bigword = chain.mfword()
chain_score = chain.score
print '\nMF word ', bigword
for i in range(len(line_list)):
line = line_list[i]
if findWholeWord(bigword)(line) is not None:
#((line.find(' '+str(bigword)+' ')!=-1) or (line.find(' '+str(bigword)+'.')!=-1)):
if line_flags[i]==0:
#summary.append(line)
#print 'i ', count_words(summary)
line_flags[i] = 1
line_score[i] = chain_score
#print 'line_score ', line_score
#print 'line_flags ', line_flags

break
elif line_flags[i]==1:
line_score[i] = line_score[i] + chain.score
#print '\nline_score ', line_score
#print 'line_flags ', line_flags
if word in self.words:
dictionary[word] += 1
else:
dictionary[word] = 1

self.words.add(word)


'''
if(count_words(summary)>word_count):
break
'''

print len(summary)
print line_score

final_summary = ' '.join(summary)
#print final_summary
def addSense(self, sense):
self.senses.add(sense)

def getWords(self):
return self.words

def getSenses(self):
return self.senses #return the sense set, not the method itself

def incCount(self):
self.count += 1

def setScore(self, sc):
self.score = sc

def mfword(self):
maxfreq = 0
for word in self.getWords():
if dictionary[word] > maxfreq:
maxword = word
maxfreq = dictionary[word]
return maxword
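# mfword() picks the chain's most frequent member word; it is used below to
# locate the sentence that best represents each chain.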

def add_word(word):
maximum = 0
maxJCN = 0
flag = 0
for chain in lexical_chains: #for all chains that are present
for synset in wn.synsets(word): #for all synsets of current word
for sense in chain.senses: #for all senses already in the current chain
similarity = sense.wup_similarity(synset) #using wup_similarity

if(similarity >= maximum):
if similarity >= threshold:
#print word, synset, sense, sense.jcn_similarity(synset, brown_ic)
JCN = sense.jcn_similarity(synset, brown_ic) #using jcn_similarity
if JCN >= jcnThreshold:
if sense.path_similarity(synset) >= 0.2: #using path similarity
if JCN >= maxJCN:
maximum = similarity
maxJCN = JCN
maxChain = chain
flag = 1
if flag == 1:
maxChain.addWord(word)
maxChain.addSense(synset)
return

lexical_chains.append(Chain([word], wn.synsets(word)))
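# A word joins an existing chain only when one of its synsets clears all
# three WordNet tests against a sense already in the chain: Wu-Palmer >=
# threshold, Jiang-Conrath >= jcnThreshold and path similarity >= 0.2;
# among qualifying chains the highest Jiang-Conrath score wins. Otherwise
# the word seeds a brand-new chain carrying all of its synsets.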


def count_words(summary):
count = 0
for line in summary:
count = count + len(line.split(' '))
return count
#fileName = raw_input("Enter file path + name, if file name is 'nlp.txt', type 'nlp' \n \n")
#n = raw_input("Enter number of sentences in summary.\n")

#fileName = "nlp.txt"
threshold = 0.6 #threshold for wup similarity
jcnThreshold = 0.09 #threshold for jcn similarity
pathThreshold = 0.1 #threshold for path similarity
brown_ic = wordnet_ic.ic('ic-brown.dat') #load the Brown information-content corpus
lexical_chains = [] #empty list to hold all the chains
dictionary = {} #empty dictionary to hold the count of each word encountered
word_count = 50
File = open(fileName) #open file
lines = File.read() #read all lines
#dec_lines = [line.decode('utf-8') for line in lines]
#print [clean_line.token for clean_line in clean_lines]

clean_lines = clean(lines)
line_list = [clean_line.text for clean_line in clean_lines]
is_noun = lambda pos: pos in ('NN', 'NNP', 'NNS', 'NNPS') #keep only noun POS tags
nouns = [word for (word, pos) in nltk.pos_tag(nltk.word_tokenize(lines)) if is_noun(pos)] #extract all nouns


for word in nouns:
add_word(word)

#print all chains
for chain in lexical_chains:
chain_length = 0
dis_word = 0
for word in chain.getWords():
#print str(word + "(" + str(dictionary[word]) + ")") + ',',
chain_length = chain_length + dictionary[word]
dis_word = dis_word + 1
#print 'Length =' + str(chain_length)
hom = 1 - (dis_word*1.0/chain_length)
#print 'Homogeneity =' + str(hom)
score = 1.0*chain_length*hom
#print 'Score =' + str(score)
chain.setScore(score)
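# Worked example (illustrative counts): a chain counting {money: 3, cash: 1,
# bank: 1} has chain_length 5 and 3 distinct words, so hom = 1 - 3/5 = 0.4
# and score = 5 * 0.4 = 2.0; a chain whose words all occur once scores 0.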

#print 'Sorted start '
lexical_chains.sort(key=lambda x: x.score, reverse=True)

if verbose==1:
for chain in lexical_chains:
if(chain.score>0.0):
for word in chain.getWords():
print str(word + "(" + str(dictionary[word]) + ")") + ',',
print 'Score=' + str(chain.score)

summary = []
line_flags = []
line_score = []

for line in line_list:
line_flags.append(0)
line_score.append(0.0) #floats so the normalization below avoids Python 2 integer division

for chain in lexical_chains:

bigword = chain.mfword()
chain_score = chain.score
#print '\nMF word ', bigword
for i in range(len(line_list)):
line = line_list[i]
if findWholeWord(bigword)(line) is not None:
#((line.find(' '+str(bigword)+' ')!=-1) or (line.find(' '+str(bigword)+'.')!=-1)):
if line_flags[i]==0:
#summary.append(line)
#print 'i ', count_words(summary)
line_flags[i] = 1
line_score[i] = chain_score
#print 'line_score ', line_score
#print 'line_flags ', line_flags

break
#elif line_flags[i]==1:
#line_score[i] = line_score[i] + chain.score
#print '\nline_score ', line_score
#print 'line_flags ', line_flags


'''
if(count_words(summary)>word_count):
break
'''
tot_score = 0
for i in range(len(line_score)):
line_score[i] = line_score[i]+1

for score in line_score:
tot_score = tot_score + score

for i in range(len(line_score)):
line_score[i] = line_score[i]/tot_score
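# The add-one step smooths sentences that no chain matched, and dividing by
# the total turns line_score into a distribution that sums to 1.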

namscores = dict(zip([sentence.token for sentence in clean_lines],line_score))

#print namscores
#print len(summary)
#print line_score

#final_summary = ' '.join(summary)
#print final_summary
return namscores

#print LexicalChain(verbose=1)
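
For context, a minimal usage sketch (not part of this commit): it assumes the NLTK wordnet and wordnet_ic corpora plus summa are installed and that amazon.txt is reachable from the working directory. LexicalChain returns a dict mapping each cleaned sentence to its normalized score, so sorting by score gives an extractive summary.

# usage sketch (hypothetical script, Python 2 to match the module)
from Boochain import LexicalChain

scores = LexicalChain(fileName="amazon.txt")  # {sentence: normalized score}
best = sorted(scores.items(), key=lambda kv: kv[1], reverse=True)[:3]
for sentence, score in best:
    print '%.3f %s' % (score, sentence)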
Binary file added LexChain/Boochain.pyc
Binary file not shown.
27 changes: 26 additions & 1 deletion Lexrank/amazon.txt
@@ -1 +1,26 @@
No billionaire has had a better year than Amazon CEO Jeff Bezos. Bezos is the biggest dollar gainer on the 2017 World’s Billionaires List, in a year when 56 percent of billionaires saw their fortunes increase. Bezos' net worth jumped by 27.6 billion in the past year, more than the total net worth of all but the 24 richest billionaires. Amazon stock climbed 67 percent in the past year in part because of the success of its cloud computing unit, Amazon Web Services. With a net worth of 72.8 billion, Bezos is now the third richest billionaire in the world, behind Microsoft cofounder Bill Gates and Berkshire Hathaway Chairman and CEO Warren Buffett. Last year, Bezos cracked the ranks of the top 10 richest billionaires in the world for the first time, ranking fifth on Forbes 2016 World’s Billionaires List. Now he is two spots higher in the ranks and nearly 30 billion richer. A decade ago, on the 2007 Billionaires List, he had a net worth of 4.4 billion.
A coalition of members of Congress announced
Wednesday that they plan to sue the Census Bureau in an effort to
force the agency to delete illegal aliens from its count in 1990.
Some 40 members of the House joined the Federation for American
Immigration Reform in announcing that the suit would be filed
Thursday in U.S. District Court in Pittsburgh, spokesmen said at a
news conference here.
The group contends that including the estimated 2 million or
more illegal aliens in the national head count, which is used to
distribute seats in the House of Representatives, will cause unfair
shifts of seats from one state to another.
Census officials say they are required to count everyone by the
U.S. Constitution, which does not mention citizenship but only
instructs that the House apportionment be based on the ``whole
number of persons'' residing in the various states. That approach
was upheld by a federal court in a similar suit, brought by the
same immigration reform group, before the 1980 Census.
Nonetheless, Dan Stein of the immigration reform federation
contended that illegal aliens should not be allowed to be part of
determining the political structure of the United States.
Rep. Tom Ridge, R-Pa., said the Census Bureau should actually
count everyone but that it should develop a method to determine how
many people are illegally in the country, and then deduct that
number from the figures used for reapportioning Congress.
Rep. Jan Meyers, R-Kan., suggested including a question on the
Census form asking whether respondents are U.S. citizens.