From 2339377128f2ab889aff13a11d4db99af2c9f789 Mon Sep 17 00:00:00 2001
From: Sudarshan
Date: Mon, 3 Apr 2017 00:54:14 +0530
Subject: [PATCH 1/4] Fixed absolute addressing

---
 Lexrank/summa/pagerank_weighted.pyc | Bin 3656 -> 3656 bytes
 Lexrank/summa/textrank.py           |   2 +-
 Lexrank/summa/textrank.pyc          | Bin 2836 -> 2573 bytes
 3 files changed, 1 insertion(+), 1 deletion(-)

diff --git a/Lexrank/summa/pagerank_weighted.pyc b/Lexrank/summa/pagerank_weighted.pyc
index 6db70016dc9e39afa15641edd0fef5dcb9fc4427..543674e3a0e17a9af12d0d5af96b1fae5f5f1a3a 100644
Binary files a/Lexrank/summa/pagerank_weighted.pyc and b/Lexrank/summa/pagerank_weighted.pyc differ

From 143be5d5c6aa748bdd57844ae9c838734ae20314 Mon Sep 17 00:00:00 2001
From: Sudarshan
Date: Mon, 3 Apr 2017 01:55:57 +0530
Subject: [PATCH 2/4] Boochain perfectly alright!

---
 LexChain/Boochain.py         | 344 ++++++++++++++++++-----------------
 LexChain/Boochain.pyc        | Bin 0 -> 5567 bytes
 Lexrank/summa/summarizer.py  |   4 +-
 Lexrank/summa/summarizer.pyc | Bin 4362 -> 4342 bytes
 Lexrank/summa/textrank.py    |   5 +-
 Lexrank/summa/textrank.pyc   | Bin 2573 -> 2573 bytes
 6 files changed, 185 insertions(+), 168 deletions(-)
 create mode 100644 LexChain/Boochain.pyc

diff --git a/LexChain/Boochain.py b/LexChain/Boochain.py
index a33f1e1..2073200 100644
--- a/LexChain/Boochain.py
+++ b/LexChain/Boochain.py
@@ -6,172 +6,186 @@ import sys
 reload(sys)
 sys.setdefaultencoding('utf8')
+sys.path.append('../Lexrank')
 
-threshold = 0.6  #treshold for wup
-jcnTreshold = 0.09  #jcn
-pathTeshold = 0.1  #path
-brown_ic = wordnet_ic.ic('ic-brown.dat')  #load the brown corpus
-lexical_chains = []  #empty list to hold all the chains
-dictionary = {}  #empty dictionart to hold the count of each word encountered
-
-
-def findWholeWord(w):
-    return re.compile(r'\b({0})\b'.format(w), flags=re.IGNORECASE).search
-
-#class Chain
-class Chain():
-    def __init__(self, words, senses, count = 0):
-        self.words = set(words)
-        self.senses = set(senses)
-        dictionary[words[0]] = 1  #initialize counter
-
-    def addWord(self, word):
-
-        if(len(self.words.intersection([word])) > 0):
-            dictionary[word] += 1
-        else:
-            dictionary[word] = 1
-
-        self.words.add(word)
-
-
-    def addSense(self, sense):
-        self.senses.add(sense)
-
-    def getWords(self):
-        return self.words
-
-    def getSenses(self):
-        return self.getSenses
-
-    def incCount(self):
-        self.count += 1
-
-    def setScore(self, sc):
-        self.score = sc
-
-    def mfword(self):
-        maxfreq = 0
-        for word in self.getWords():
-            if dictionary[word] > maxfreq:
-                maxword = word
-                maxfreq = dictionary[word]
-        return maxword
-
-
-def add_word(word):
-    maximum = 0
-    maxJCN = 0
-    flag = 0
-    for chain in lexical_chains:  #for all chains that are present
-        for synset in wn.synsets(word):  #for all synsets of current word
-            for sense in chain.senses:  #for all senses of the current word in current element of the current chain
-                similarity = sense.wup_similarity(synset)  #using wup_similarity
-
-                if(similarity >= maximum):
-                    if similarity >= threshold:
-                        #print word, synset, sense, sense.jcn_similarity(synset, brown_ic)
-                        JCN = sense.jcn_similarity(synset, brown_ic)  #using jcn_similarity
-                        if JCN >= jcnTreshold:
-                            if sense.path_similarity(synset) >= 0.2:  #using path similarity
-                                if JCN >= maxJCN:
-                                    maximum = similarity
-                                    maxJCN = JCN
-                                    maxChain = chain
-                                    flag = 1
-    if flag == 1:
-        maxChain.addWord(word)
-        maxChain.addSense(synset)
-        return
-
-    lexical_chains.append(Chain([word], wn.synsets(word)))
-
-def count_words(summary):
-    count = 0
-    for line in summary:
-        count = count + len(line.split(' '))
-    return count
-#fileName = raw_input("Enter file path + name, if file name is 'nlp.txt', type 'nlp' \n \n")
-#n = raw_input("Enter number of sentences in summary.\n")
-word_count=50
-fileName = "amazon.txt"
-print ("\n\n")
-#fileName = "nlp.txt"
-File = open(fileName)  #open file
-lines = File.read()  #read all lines
-#dec_lines = [line.decode('utf-8') for line in lines]
-
-line_list = lines.split('. ')
-
-
-is_noun = lambda x: True if (pos == 'NN' or pos == 'NNP' or pos == 'NNS' or pos == 'NNPS') else False
-nouns = [word for (word, pos) in nltk.pos_tag(nltk.word_tokenize(lines)) if is_noun(pos)]  #extract all nouns
-
-for word in nouns:
-    add_word(word)
-
-#print all chains
-for chain in lexical_chains:
-    chain_length = 0
-    dis_word = 0
-    for word in chain.getWords():
-        #print str(word + "(" + str(dictionary[word]) + ")") + ',',
-        chain_length = chain_length + dictionary[word]
-        dis_word = dis_word + 1
-    #print 'Length =' + str(chain_length)
-    hom = 1 - (dis_word*1.0/chain_length)
-    #print 'Homogeneity =' + str(hom)
-    score = 1.0*chain_length*hom
-    #print 'Score =' + str(score)
-    chain.setScore(score)
-
-print 'Sorted start '
-lexical_chains.sort(key=lambda x: x.score, reverse=True)
-
-for chain in lexical_chains:
-    if(chain.score>0.0):
-        for word in chain.getWords():
-            print str(word + "(" + str(dictionary[word]) + ")") + ',',
-        print 'Score=' + str(chain.score)
-
-summary = []
-line_flags = []
-line_score=[]
-
-for line in line_list:
-    line_flags.append(0)
-    line_score.append(0)
-
-for chain in lexical_chains:
-    bigword = chain.mfword()
-    chain_score = chain.score
-    print '\nMF word ', bigword
-    for i in range(len(line_list)):
-        line=line_list[i]
-        if findWholeWord(bigword)(line)!=None:
-            #((line.find(' '+str(bigword)+' ')!=-1) or (line.find(' '+str(bigword)+'.')!=-1)):
-            if line_flags[i]==0:
-                #summary.append(line)
-                #print 'i ', count_words(summary)
-                line_flags[i] = 1
-                line_score[i] = chain_score
-                #print 'line_score ', line_score
-                #print 'line_flags ', line_flags
-
-                break
-            elif line_flags[i]==1:
-                line_score[i] = line_score[i] + chain.score
-                #print '\nline_score ', line_score
-                #print 'line_flags ', line_flags
-
+from summa.preprocessing.textcleaner import clean_text_by_sentences as clean
-'''
-    if(count_words(summary)>word_count):
-        break
-'''
-print len(summary)
-print line_score
+def LexicalChain(fileName="amazon.txt", verbose=0):
-final_summary = ' '.join(summary)
-#print final_summary
\ No newline at end of file
+
+    def findWholeWord(w):
+        return re.compile(r'\b({0})\b'.format(w), flags=re.IGNORECASE).search
+
+    #class Chain
+    class Chain():
+        def __init__(self, words, senses, count = 0):
+            self.words = set(words)
+            self.senses = set(senses)
+            dictionary[words[0]] = 1  #initialize counter
+
+        def addWord(self, word):
+
+            if(len(self.words.intersection([word])) > 0):
+                dictionary[word] += 1
+            else:
+                dictionary[word] = 1
+
+            self.words.add(word)
+
+
+        def addSense(self, sense):
+            self.senses.add(sense)
+
+        def getWords(self):
+            return self.words
+
+        def getSenses(self):
+            return self.getSenses
+
+        def incCount(self):
+            self.count += 1
+
+        def setScore(self, sc):
+            self.score = sc
+
+        def mfword(self):
+            maxfreq = 0
+            for word in self.getWords():
+                if dictionary[word] > maxfreq:
+                    maxword = word
+                    maxfreq = dictionary[word]
+            return maxword
+
+    def add_word(word):
+        maximum = 0
+        maxJCN = 0
+        flag = 0
+        for chain in lexical_chains:  #for all chains that are present
+            for synset in wn.synsets(word):  #for all synsets of current word
+                for sense in chain.senses:  #for all senses of the current word in current element of the current chain
+                    similarity = sense.wup_similarity(synset)  #using wup_similarity
+
+                    if(similarity >= maximum):
+                        if similarity >= threshold:
+                            #print word, synset, sense, sense.jcn_similarity(synset, brown_ic)
+                            JCN = sense.jcn_similarity(synset, brown_ic)  #using jcn_similarity
+                            if JCN >= jcnTreshold:
+                                if sense.path_similarity(synset) >= 0.2:  #using path similarity
+                                    if JCN >= maxJCN:
+                                        maximum = similarity
+                                        maxJCN = JCN
+                                        maxChain = chain
+                                        flag = 1
+        if flag == 1:
+            maxChain.addWord(word)
+            maxChain.addSense(synset)
+            return
+
+        lexical_chains.append(Chain([word], wn.synsets(word)))
+
+
+    def count_words(summary):
+        count = 0
+        for line in summary:
+            count = count + len(line.split(' '))
+        return count
+    #fileName = raw_input("Enter file path + name, if file name is 'nlp.txt', type 'nlp' \n \n")
+    #n = raw_input("Enter number of sentences in summary.\n")
+
+    #fileName = "nlp.txt"
+    threshold = 0.6  #treshold for wup
+    jcnTreshold = 0.09  #jcn
+    pathTeshold = 0.1  #path
+    brown_ic = wordnet_ic.ic('ic-brown.dat')  #load the brown corpus
+    lexical_chains = []  #empty list to hold all the chains
+    dictionary = {}  #empty dictionart to hold the count of each word encountered
+    word_count=50
+    File = open(fileName)  #open file
+    lines = File.read()  #read all lines
+    #dec_lines = [line.decode('utf-8') for line in lines]
+
+    line_list = lines.split('. ')
+    clean_lines = clean(lines)
+
+    is_noun = lambda x: True if (pos == 'NN' or pos == 'NNP' or pos == 'NNS' or pos == 'NNPS') else False
+    nouns = [word for (word, pos) in nltk.pos_tag(nltk.word_tokenize(lines)) if is_noun(pos)]  #extract all nouns
+
+
+    for word in nouns:
+        add_word(word)
+
+    #print all chains
+    for chain in lexical_chains:
+        chain_length = 0
+        dis_word = 0
+        for word in chain.getWords():
+            #print str(word + "(" + str(dictionary[word]) + ")") + ',',
+            chain_length = chain_length + dictionary[word]
+            dis_word = dis_word + 1
+        #print 'Length =' + str(chain_length)
+        hom = 1 - (dis_word*1.0/chain_length)
+        #print 'Homogeneity =' + str(hom)
+        score = 1.0*chain_length*hom
+        #print 'Score =' + str(score)
+        chain.setScore(score)
+
+    #print 'Sorted start '
+    lexical_chains.sort(key=lambda x: x.score, reverse=True)
+
+    if verbose==1:
+        for chain in lexical_chains:
+            if(chain.score>0.0):
+                for word in chain.getWords():
+                    print str(word + "(" + str(dictionary[word]) + ")") + ',',
+                print 'Score=' + str(chain.score)
+
+    summary = []
+    line_flags = []
+    line_score=[]
+
+    for line in line_list:
+        line_flags.append(0)
+        line_score.append(0)
+
+    for chain in lexical_chains:
+        if chain.score>0.0:
+            bigword = chain.mfword()
+            chain_score = chain.score
+            #print '\nMF word ', bigword
+            for i in range(len(line_list)):
+                line=line_list[i]
+                if findWholeWord(bigword)(line)!=None:
+                    #((line.find(' '+str(bigword)+' ')!=-1) or (line.find(' '+str(bigword)+'.')!=-1)):
+                    if line_flags[i]==0:
+                        #summary.append(line)
+                        #print 'i ', count_words(summary)
+                        line_flags[i] = 1
+                        line_score[i] = chain_score
+                        #print 'line_score ', line_score
+                        #print 'line_flags ', line_flags
+
+                        break
+                    elif line_flags[i]==1:
+                        line_score[i] = line_score[i] + chain.score
+                        #print '\nline_score ', line_score
+                        #print 'line_flags ', line_flags
+
+
+    '''
+    if(count_words(summary)>word_count):
+        break
+
+    '''
+
+    namscores = dict(zip([sentence.token for sentence in clean_lines],line_score))
+
+    #print namscores
+    #print len(summary)
+    #print line_score
+
+    #final_summary = ' '.join(summary)
+    #print final_summary
+    return namscores
+
+print LexicalChain(verbose=1)
\ No newline at end of file
diff --git a/LexChain/Boochain.pyc b/LexChain/Boochain.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d81da96f83fe45ebe9e08c875c8f1862284e66f8
Binary files /dev/null and b/LexChain/Boochain.pyc differ
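A note on the chain-building step in Boochain.py above: add_word() attaches a noun to an existing chain only when one of its WordNet synsets clears three similarity gates against a sense already in the chain (Wu-Palmer >= 0.6, Jiang-Conrath >= 0.09 computed with the Brown information-content file, and path similarity >= 0.2), preferring the candidate with the highest JCN score; otherwise the noun seeds a new chain. Below is a minimal sketch of that gating in Python 2 with NLTK; only the thresholds come from the patch, the function name and example words are illustrative, and the 'wordnet' and 'wordnet_ic' NLTK data packages must already be installed.

    from nltk.corpus import wordnet as wn
    from nltk.corpus import wordnet_ic

    brown_ic = wordnet_ic.ic('ic-brown.dat')   # same IC file the patch loads

    def best_link(word, chain_senses, wup_t=0.6, jcn_t=0.09, path_t=0.2):
        # Return (new_sense, chain_sense, jcn) for the strongest link that clears
        # all three thresholds, or None if the word should start a new chain.
        best = None
        for synset in wn.synsets(word, pos=wn.NOUN):
            for sense in chain_senses:
                wup = sense.wup_similarity(synset)
                path = sense.path_similarity(synset)
                if wup is None or path is None:
                    continue
                if wup < wup_t or path < path_t:
                    continue
                jcn = sense.jcn_similarity(synset, brown_ic)
                if jcn >= jcn_t and (best is None or jcn > best[2]):
                    best = (synset, sense, jcn)
        return best

    # 'car' should attach to a chain seeded by the senses of 'automobile'.
    print best_link('car', wn.synsets('automobile', pos=wn.NOUN))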
diff --git a/Lexrank/summa/textrank.py b/Lexrank/summa/textrank.py
index eba99a2..2f7189a 100644
--- a/Lexrank/summa/textrank.py
+++ b/Lexrank/summa/textrank.py
@@ -2,11 +2,13 @@
 import sys
 sys.path.append('../')
-
 import sys, getopt
 from summarizer import summarize
 from keywords import keywords
 
+#sys.path.append('../../LexChain')
+#from Boochain import LexicalChain
+
 # Types of summarization
 SENTENCE = 0
 WORD = 1
@@ -59,6 +61,7 @@ def usage():
 
 
 def textrank(text, summarize_by=SENTENCE, ratio=0.2, words=None):
+    #namscores = LexicalChain()
     if summarize_by == SENTENCE:
         return summarize(text, ratio, words)
     else:
diff --git a/Lexrank/summa/textrank.pyc b/Lexrank/summa/textrank.pyc
index d9481724a1a11d14c36bba76ec3c8a3851ec916e..3b99747bb257531eafa9f45b7f4edfb5e79606b3 100644
Binary files a/Lexrank/summa/textrank.pyc and b/Lexrank/summa/textrank.pyc differ

From: Sudarshan
Date: Mon, 3 Apr 2017 02:02:28 +0530
Subject: [PATCH 3/4] Namma scores are entered into textrank

---
 LexChain/Boochain.py       |   2 +-
 LexChain/Boochain.pyc      | Bin 5567 -> 5457 bytes
 Lexrank/summa/textrank.py  |  11 ++++++-----
 Lexrank/summa/textrank.pyc | Bin 2573 -> 2732 bytes
 4 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/LexChain/Boochain.py b/LexChain/Boochain.py
index 2073200..5c1500b 100644
--- a/LexChain/Boochain.py
+++ b/LexChain/Boochain.py
@@ -188,4 +188,4 @@ def count_words(summary):
     #print final_summary
     return namscores
 
-print LexicalChain(verbose=1)
\ No newline at end of file
+#print LexicalChain(verbose=1)
\ No newline at end of file
diff --git a/LexChain/Boochain.pyc b/LexChain/Boochain.pyc
index d81da96f83fe45ebe9e08c875c8f1862284e66f8..bc5593208a097e41fcab93e0c6c6c3a2d1bdf66d 100644
Binary files a/LexChain/Boochain.pyc and b/LexChain/Boochain.pyc differ
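With the module-level print call commented out above, Boochain.py can now be imported without side effects. LexicalChain() reads the input file, builds and scores the chains, and returns a dict mapping each cleaned sentence token to its lexical-chain score, which is what the textrank.py change below consumes. A usage sketch (the path and input file are illustrative, and the NLTK data used by the chainer must be available):

    import sys
    sys.path.append('../../LexChain')      # mirror of the path added in textrank.py

    from Boochain import LexicalChain

    # {sentence_token: score} for every sentence of the file
    namscores = LexicalChain(fileName="amazon.txt", verbose=0)

    # peek at the five highest-scoring sentences
    for token, score in sorted(namscores.items(), key=lambda kv: kv[1], reverse=True)[:5]:
        print score, token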
diff --git a/Lexrank/summa/textrank.py b/Lexrank/summa/textrank.py
index 2f7189a..2afff29 100644
--- a/Lexrank/summa/textrank.py
+++ b/Lexrank/summa/textrank.py
@@ -6,8 +6,8 @@
 from summarizer import summarize
 from keywords import keywords
 
-#sys.path.append('../../LexChain')
-#from Boochain import LexicalChain
+sys.path.append('../../LexChain')
+from Boochain import LexicalChain
 
 # Types of summarization
 SENTENCE = 0
@@ -60,8 +60,9 @@ def usage():
     print help_text
 
 
-def textrank(text, summarize_by=SENTENCE, ratio=0.2, words=None):
-    #namscores = LexicalChain()
+def textrank(text, path, summarize_by=SENTENCE, ratio=0.2, words=None):
+    namscores = LexicalChain(fileName=path)
+    print namscores
     if summarize_by == SENTENCE:
         return summarize(text, ratio, words)
     else:
@@ -74,7 +75,7 @@ def main():
     with open(path) as file:
         text = file.read()
 
-    print textrank(text, summarize_by, ratio, words)
+    print textrank(text, path, summarize_by, ratio, words)
 
 
 if __name__ == "__main__":
diff --git a/Lexrank/summa/textrank.pyc b/Lexrank/summa/textrank.pyc
index 3b99747bb257531eafa9f45b7f4edfb5e79606b3..1dda07a241e3ad6f33ee1604e52ea7f1f2eea360 100644
Binary files a/Lexrank/summa/textrank.pyc and b/Lexrank/summa/textrank.pyc differ

From 1cb5ddca63a03f10200f46cdff8f87d5ec6c95b8 Mon Sep 17 00:00:00 2001
From: Sudarshan
Date: Mon, 3 Apr 2017 02:50:52 +0530
Subject: [PATCH 4/4] Completed code. Cleanup yet to be done.

---
 LexChain/Boochain.py                |  57 ++++++++++++++++------------
 LexChain/Boochain.pyc               | Bin 5457 -> 5573 bytes
 Lexrank/amazon.txt                  |  27 ++++++++++++-
 Lexrank/summa/pagerank_weighted.py  |  16 +++++++-
 Lexrank/summa/pagerank_weighted.pyc | Bin 3656 -> 3691 bytes
 Lexrank/summa/summarizer.py         |   8 ++--
 Lexrank/summa/summarizer.pyc        | Bin 4342 -> 4337 bytes
 Lexrank/summa/textrank.py           |   4 +-
 Lexrank/summa/textrank.pyc          | Bin 2732 -> 2728 bytes
 9 files changed, 80 insertions(+), 32 deletions(-)

diff --git a/LexChain/Boochain.py b/LexChain/Boochain.py
index 5c1500b..41ceeb3 100644
--- a/LexChain/Boochain.py
+++ b/LexChain/Boochain.py
@@ -104,10 +104,10 @@ def count_words(summary):
     File = open(fileName)  #open file
     lines = File.read()  #read all lines
     #dec_lines = [line.decode('utf-8') for line in lines]
+    #print [clean_line.token for clean_line in clean_lines]
 
-    line_list = lines.split('. ')
     clean_lines = clean(lines)
-
+    line_list = [clean_line.text for clean_line in clean_lines]
     is_noun = lambda x: True if (pos == 'NN' or pos == 'NNP' or pos == 'NNS' or pos == 'NNPS') else False
     nouns = [word for (word, pos) in nltk.pos_tag(nltk.word_tokenize(lines)) if is_noun(pos)]  #extract all nouns
 
@@ -149,34 +149,43 @@ def count_words(summary):
         line_score.append(0)
 
     for chain in lexical_chains:
-        if chain.score>0.0:
-            bigword = chain.mfword()
-            chain_score = chain.score
-            #print '\nMF word ', bigword
-            for i in range(len(line_list)):
-                line=line_list[i]
-                if findWholeWord(bigword)(line)!=None:
-                    #((line.find(' '+str(bigword)+' ')!=-1) or (line.find(' '+str(bigword)+'.')!=-1)):
-                    if line_flags[i]==0:
-                        #summary.append(line)
-                        #print 'i ', count_words(summary)
-                        line_flags[i] = 1
-                        line_score[i] = chain_score
-                        #print 'line_score ', line_score
-                        #print 'line_flags ', line_flags
-
-                        break
-                    elif line_flags[i]==1:
-                        line_score[i] = line_score[i] + chain.score
-                        #print '\nline_score ', line_score
-                        #print 'line_flags ', line_flags
-
+
+        bigword = chain.mfword()
+        chain_score = chain.score
+        #print '\nMF word ', bigword
+        for i in range(len(line_list)):
+            line=line_list[i]
+            if findWholeWord(bigword)(line)!=None:
+                #((line.find(' '+str(bigword)+' ')!=-1) or (line.find(' '+str(bigword)+'.')!=-1)):
+                if line_flags[i]==0:
+                    #summary.append(line)
+                    #print 'i ', count_words(summary)
+                    line_flags[i] = 1
+                    line_score[i] = chain_score
+                    #print 'line_score ', line_score
+                    #print 'line_flags ', line_flags
+
+                    break
+                #elif line_flags[i]==1:
+                    #line_score[i] = line_score[i] + chain.score
+                    #print '\nline_score ', line_score
+                    #print 'line_flags ', line_flags
+
 
     '''
     if(count_words(summary)>word_count):
         break
 
     '''
+    tot_score = 0
+    for i in range(len(line_score)):
+        line_score[i] = line_score[i]+1
+
+    for score in line_score:
+        tot_score = tot_score + score
+
+    for i in range(len(line_score)):
+        line_score[i] = line_score[i]/tot_score
 
     namscores = dict(zip([sentence.token for sentence in clean_lines],line_score))
 
diff --git a/LexChain/Boochain.pyc b/LexChain/Boochain.pyc
index bc5593208a097e41fcab93e0c6c6c3a2d1bdf66d..74740219a7efa5cee1575e5c3f787a42e9f6c189 100644
Binary files a/LexChain/Boochain.pyc and b/LexChain/Boochain.pyc differ
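The Boochain.py hunk above drops the score gate on chains and the elif accumulation, then turns the raw per-sentence scores into a normalised distribution: every entry gets add-one smoothing and is divided by the new total. A small worked example of that arithmetic (the numbers are made up):

    # raw chain scores assigned to three sentences
    line_score = [4.0, 0.0, 2.0]

    # add-one smoothing, as in the patch, so no sentence ends up with weight 0
    line_score = [s + 1 for s in line_score]           # [5.0, 1.0, 3.0]

    # normalise by the total so the weights sum to 1
    tot_score = sum(line_score)                         # 9.0
    line_score = [s / tot_score for s in line_score]   # [0.555..., 0.111..., 0.333...]

Note that the division runs under Python 2; the entries are floats as soon as any chain score is assigned, but if every entry stayed an integer the division would truncate to 0.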
diff --git a/Lexrank/summa/summarizer.py b/Lexrank/summa/summarizer.py
index 1aa2d62..3eac92f 100644
--- a/Lexrank/summa/summarizer.py
+++ b/Lexrank/summa/summarizer.py
@@ -88,11 +88,11 @@ def _extract_most_important_sentences(sentences, ratio, words):
     return _get_sentences_with_word_count(sentences, words)
 
 
-def summarize(text, ratio=0.2, words=None, language="english", split=False, scores=False):
+def summarize(text, namscores, ratio=0.2, words=None, language="english", split=False, scores=False):
     # Gets a list of processed sentences.
     sentences = _clean_text_by_sentences(text, language)
 
-    namscores=[]
+    #print namscores
 
     # Creates the graph and calculates the similarity coefficient for every pair of nodes.
     graph = _build_graph([sentence.token for sentence in sentences])
@@ -102,7 +102,9 @@ def summarize(text, ratio=0.2, words=None, language="english", split=False, scor
     _remove_unreachable_nodes(graph)
 
     # Ranks the tokens using the PageRank algorithm. Returns dict of sentence -> score
-    pagerank_scores = _pagerank(graph)
+    pagerank_scores = _pagerank(graph, namscores)
+
+    #print pagerank_scores
 
     # Adds the summa scores to the sentence objects.
     _add_scores_to_sentences(sentences, pagerank_scores)
diff --git a/Lexrank/summa/summarizer.pyc b/Lexrank/summa/summarizer.pyc
index 1bdcb25a850ae417ecf7dc97d3b69725c175ea34..44621da7f6bda700cd7960d11dd7cf3b5ba51397 100644
Binary files a/Lexrank/summa/summarizer.pyc and b/Lexrank/summa/summarizer.pyc differ
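Taken together, the last two patches wire the lexical-chain scores into the TextRank pipeline: textrank() builds namscores via LexicalChain(), and summarize() now accepts them and forwards them to _pagerank(graph, namscores), with the actual weighting handled in pagerank_weighted.py. A sketch of the resulting call chain, assuming it is driven the same way textrank.py's main() does (the file path is illustrative and the summa modules are imported from Lexrank/summa):

    import sys
    sys.path.append('../../LexChain')

    from Boochain import LexicalChain
    from summarizer import summarize

    path = "amazon.txt"                    # illustrative input document
    with open(path) as f:
        text = f.read()

    # per-sentence lexical-chain weights, normalised to sum to 1
    namscores = LexicalChain(fileName=path)

    # summarize() passes namscores on to _pagerank(graph, namscores);
    # how the two score sets are blended lives in pagerank_weighted.py
    print summarize(text, namscores, ratio=0.2)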